Writing a hybrid MPI/OpenACC program

Time: 2015-10-14 16:22:53

Tags: c cuda mpi openacc

I am trying to write hybrid MPI/OpenACC code in which the program has to perform 8 different jobs (in this case, 8 different sweeps). These 8 jobs are divided among [1-8] processes/nodes using MPI, and the computation needed inside each of the 8 jobs is parallelized with OpenACC.

After each process finishes its computation, I reduce the solutions, passing the element-wise minimum to process 0, which holds the final solution.
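
The reduction itself is just an element-wise MPI_MIN into rank 0; a minimal sketch of that pattern is below (the full MCVE does the equivalent with p->distance and totalNodes):

#include <stdlib.h>
#include "mpi.h"

// Sketch of the min-reduction step: rank 0 receives, for each element, the
// minimum over all ranks' copies of buf.  Returns NULL on every other rank.
double *min_reduce_to_root(double *buf, int n, int rank)
{
    double *result = NULL;
    if (rank == 0)
        result = (double *) malloc(n * sizeof(double));
    MPI_Reduce(buf, result, n, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    return result;
}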

Below is an MCVE of the complete code (test.c), which generates a .txt output file.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "mpi.h"

#define min(a,b) (a > b) ? b : a
#define max(a,b) (a < b) ? b : a

#define NPES 8 // max number of PEs allowed
#define DEFAULT_BORDER_LOCATION   -1
#define DEFAULT_BORDER_DISTANCE   INFINITY
#define DEFAULT_INTERIOR_DISTANCE 90000

typedef struct {
  int order;
  int firstLevel, lastLevel, level;
  int xDim, yDim, zDim;
  int xSweepOff, ySweepOff, zSweepOff;
  double dx, dy, dz;
} SweepInfo;

typedef struct {
  double   dx, dy, dz;
  int * location;
  double * distance;
} Phi;

typedef struct {
  int x, y, z;
} Grid3D;



void calc_dist_field( Phi * p, int totalNodes );
void write_to_file(double * dist);
static SweepInfo make_sweepInfo( Phi * p, int my_rank );
static void fast_sweep( Phi * p, SweepInfo * s );
static double solveEikonal(Phi * p, int index, int max_x, int max_y);
static void update_distance(Phi * p, int totalNodes);
static void set_distance_negative_inside(Phi * p, int totalNodes);
static void adjust_boundary( Phi * p );


// public method declarations
Grid3D make_grid3D(int x, int y, int z);
void vti_get_dimensions(FILE *vti, double *d);
void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g);

// private method declarations
static void move_file_pointer(FILE *file_ptr, int lineNumber, int r);
static void get_location(FILE *vti, int *l, int b_l, Grid3D g);
static void get_distance(FILE *vti, double *d, double b_d, Grid3D g);

static int npes;          // Number of PEs
static int my_rank;       // Rank of the PE
static char * fileName;
static char * outfileName;

static int NX, NY, NZ, totalNodes;

int main(int argc, char *argv[]) {

  // MPI startup routine
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &npes);

  fileName    = argv[1];
  outfileName = argv[2];
  FILE *f = fopen(fileName, "r");

  double dims[6];
  vti_get_dimensions(f, dims);
  NX = dims[0] + 3;
  NY = dims[1] + 3;
  NZ = dims[2] + 3;

  totalNodes = NX * NY * NZ;
  Phi *p = (Phi *) malloc(sizeof(Phi));
  p->location = (int *) malloc(sizeof(int) * totalNodes);
  p->distance = (double *) malloc(sizeof(double) * totalNodes);

  p->dx = dims[3]; p->dy = dims[4]; p->dz = dims[5];

  vti_get_data( f, p->location, DEFAULT_BORDER_LOCATION,
                   p->distance, DEFAULT_BORDER_DISTANCE,
                make_grid3D(NX, NY, NZ));


  update_distance(p, totalNodes);

  calc_dist_field(p, totalNodes);

  MPI_Finalize();
  return 0;
}

void calc_dist_field( Phi * p, int totalNodes ) {

  int sweepNumber = my_rank + 1;
  double * tmp_dist;

  MPI_Barrier(MPI_COMM_WORLD);
  if(my_rank == 0){
    tmp_dist = (double *) malloc( totalNodes * sizeof(double) );
  }

  // sn represents the sweep number
  for( int sn = sweepNumber; sn <= NPES; sn += npes) {
    SweepInfo s = make_sweepInfo(p, sn);

    printf("PE: [%d] - performing sweep number ..... [%d/%d]\n", my_rank, sn, NPES);

    fast_sweep(p, &s);

    printf("PE: [%d] - completed sweep number ...... [%d/%d]\n", my_rank, sn, NPES);

  }

  MPI_Barrier(MPI_COMM_WORLD);
  #pragma acc update host(p->distance[0:totalNodes])
  MPI_Reduce(p->distance, tmp_dist, totalNodes, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);

  if( my_rank == 0 ) {
    free( p->distance );
    p->distance = tmp_dist;
    set_distance_negative_inside(p, totalNodes);
    adjust_boundary(p);
    write_to_file(p->distance);
    printf("%s file created\n", outfileName);
  }

}

static void update_distance(Phi * p, int totalNodes) {

  int    *l = &p->location[0];
  double *d = &p->distance[0];

  for(int i = 0; i < totalNodes; i++) {
    if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
      *d = (*l == 1 && *d == INFINITY) ? -1 : (*d > 0.0 || *d < 0.0) ? *d : DEFAULT_INTERIOR_DISTANCE;
    }
    l++; d++;
  }

}



void write_to_file(double * dist) {

  int x = NX;
  int y = NY;
  int z = NZ;
  char fname[255];
  sprintf(fname, "%s.txt", outfileName);
  FILE *fp = fopen(fname, "w");

  int i,j,k;
  double *t = &dist[0];
  for(i = 0; i < z; i++){
    for(j = 0; j < y; j++){
      for(k = 0; k < x; k++) {
        fprintf(fp, "%f  ", *(t++));
      }
      fprintf(fp, "\n");
    }
    fprintf(fp, "\n");
  }

}


static SweepInfo make_sweepInfo( Phi * p, int my_rank ) {
  SweepInfo s;

  s.order      = my_rank;
  s.firstLevel = 3;
  s.lastLevel  = (NX + NY + NZ) - 6;

  s.xDim = NX-2; s.dx = p->dx;
  s.yDim = NY-2; s.dy = p->dy;
  s.zDim = NZ-2; s.dz = p->dz;

  s.xSweepOff = (s.order == 4 || s.order == 8 ) ? s.xDim + 1 : 0;
  s.ySweepOff = (s.order == 2 || s.order == 6 ) ? s.yDim + 1 : 0;
  s.zSweepOff = (s.order == 3 || s.order == 7 ) ? s.zDim + 1 : 0;

  return s;
}

static void fast_sweep( Phi * p, SweepInfo * s ) {

  int start, end, incr;

  start = ( s->order == 2 || s->order == 5 || s->order == 7 || s->order == 8 ) ? s->lastLevel : s->firstLevel;

  if ( start == s->firstLevel ) {
    end  = s->lastLevel + 1;
    incr = 1;
  }
  else {
    end  = s->firstLevel - 1;
    incr = 0;
  }

  int max_x  = s->xDim + 2;
  int max_y  = s->yDim + 2;
  int max_xy = max_x * max_y;

  #pragma acc data create(p[0:1]) copy(p->distance[0:totalNodes])
  for(int level = start; level != end; level = (incr) ? level+1 : level-1) {
    // s - start, e - end
    int xs, xe, ys, ye;

    xs = max(1, level-(s->yDim + s->zDim))    , ys = max(1,level-(s->xDim + s->zDim));
    xe = min(s->xDim, level-(s->firstLevel-1)), ye = min(s->yDim, level-(s->firstLevel-1));

    int x, y, z, i, j, k, index;
    #pragma acc parallel
    {
      #pragma acc loop independent
      for(x = xs; x <= xe; x++) {
        #pragma acc loop independent
        for(y = ys; y <= ye; y++) {
          z = level - (x+y);
          if(z > 0 && z <= NZ-2) {
            i = abs(z-s->zSweepOff);
            j = abs(y-s->ySweepOff);
            k = abs(x-s->xSweepOff);
            index = i * max_xy + j * max_x + k;
            p->distance[index] = solveEikonal(p, index, NX, NY);
          }
        }
      } // end of acc parallel
    }
  }
}

#pragma acc routine seq
static double solveEikonal(Phi * p, int index, int max_x, int max_y) {

  int max_xy = max_x * max_y;

  double dist_new = 0;
  double dist_old = p->distance[index];

  double dx = p->dx, dy = p->dy, dz = p->dz;
  double minX = min(p->distance[index-1], p->distance[index+1]);
  double minY = min(p->distance[abs(index-max_x)], p->distance[abs(index+max_x)]);
  double minZ = min(p->distance[abs(index-max_xy)],p->distance[abs(index+max_xy)]);

  double m[] = { minX, minY, minZ} ;
  double d[] = { dx, dy, dz};

  // sort the mins 
  for(int i = 1; i < 3; i++){
    for(int j = 0; j < 3-i; j++) {
      if(m[j] > m[j+1]) {
        double tmp_m = m[j];
        double tmp_d = d[j];
        m[j] = m[j+1]; d[j] = d[j+1];
        m[j+1] = tmp_m; d[j+1] = tmp_d;
      }
    }
  }

  // simplifying the variables
  double m_0 = m[0], m_1 = m[1], m_2 = m[2];
  double d_0 = d[0], d_1 = d[1], d_2 = d[2]; 
  double m2_0 = m_0 * m_0, m2_1 = m_1 * m_1, m2_2 = m_2 * m_2;
  double d2_0 = d_0 * d_0, d2_1 = d_1 * d_1, d2_2 = d_2 * d_2;

  dist_new = m_0 + d_0;
  if(dist_new > m_1) {

    double s = sqrt(- m2_0 + 2 * m_0 * m_1 - m2_1 + d2_0 + d2_1); 
    dist_new = ( m_1 * d2_0 + m_0 * d2_1 + d_0 * d_1 * s) / (d2_0 + d2_1);

    if(dist_new > m_2) {

      double a = sqrt(- m2_0 * d2_1 - m2_0 * d2_2 + 2 * m_0 * m_1 * d2_2
                      - m2_1 * d2_0 - m2_1 * d2_2 + 2 * m_0 * m_2 * d2_1
                      - m2_2 * d2_0 - m2_2 * d2_1 + 2 * m_1 * m_2 * d2_0
                      + d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);

      dist_new = (m_2 * d2_0 * d2_1 + m_1 * d2_0 * d2_2 + m_0 * d2_1 * d2_2 + d_0 * d_1 * d_2 * a) /
                  (d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);
    }
  }

  return min(dist_old, dist_new);
}



static void set_distance_negative_inside(Phi * p, int totalNodes) {

  int    *l = &p->location[0];
  double *d = &p->distance[0];

  for(int i = 0; i < totalNodes; i++) {
    if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
      if( *l == 1) *d = -1;
    }
    l++; d++;
  }

}

static void adjust_boundary( Phi * p ) {

  int x, y, z, xy, i, j, k;
  x  = NX;
  y  = NY;
  z  = NZ;
  xy = x * y;

  for(i = 0; i < z; i++){
    for(j = 0; j < y; j++){
      for(k = 0; k < x; k++){
        int I = i, J = j, K = k;
        I = (i == z-1) ? I-1 : (!i) ? I+1 : I;
        J = (j == y-1) ? J-1 : (!j) ? J+1 : J;
        K = (k == x-1) ? K-1 : (!k) ? K+1 : K;
        if( i != I || j != J || k != K) {
            int l_index = i * xy + j * x + k;
            int r_index = I * xy + J * x + K;
            p->distance[l_index] = p->distance[r_index];
        }
      }
    }
  }
}



/**************** vti_parser ********************************/

static void move_file_pointer(FILE *file_ptr, int lineNumber, int r) {
    char tmpStr[512];
    if(r) rewind(file_ptr);
    while (lineNumber > 0){
        fgets (tmpStr, 511, file_ptr);
        lineNumber--;
    }
}

void vti_get_dimensions(FILE *vti, double *d) {
    char tmpStr[512];
    rewind(vti);
    while (1) {
        fgets (tmpStr, 511, vti);
        if ( strstr(tmpStr, "ImageData WholeExtent") ) {
            sscanf(tmpStr, "    <ImageData WholeExtent=\"0 %lf 0 %lf 0 %lf\" Spacing=\"%lf %lf %lf\">",
                    &d[0], &d[1], &d[2], &d[3], &d[4], &d[5]);
            break;
        }
    }
}


void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g) {

    // move the file pointer to
    // line 6 from beginning
    move_file_pointer(vti, 6, 1);

    get_location(vti, l, b_l, g);

    // move the file pointer 2 lines
    // forward from its last position
    move_file_pointer(vti, 2, 0);

    get_distance(vti, d, b_d, g);

}

static void get_location(FILE *vti, int *l, int b_l, Grid3D g) {
    int i, j, k, *t = &l[0];
    for (i = 0; i < g.z; i++){
        for (j = 0; j < g.y; j++) {
            for (k = 0; k < g.x; k++) {
                // Border
                if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
                    *(t++) = b_l;
                }
                else{ // Interior
                    fscanf(vti, "%d ", t++);
                }
            }
        }
    }
}



static void get_distance(FILE *vti, double *d, double b_d, Grid3D g) {
    int i, j, k;
    double *t = &d[0];
    for (i = 0; i < g.z; i++){
        for (j = 0; j < g.y; j++) {
            for (k = 0; k < g.x; k++) {
                // Border distance
                if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
                    *(t++) = b_d;
                }
                else{ // Interior distance
                    fscanf(vti, "%lf ", t++);
                }
            }
        }
    }
}

Grid3D make_grid3D(int x, int y, int z){
    Grid3D g;
    g.x = x; g.y = y; g.z = z;

    return g;
}

The code works when I drop the OpenACC directives and run it with [1-8] processes, but when compiling with OpenACC I get a CUDA error:

call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

MPI compilation:

mpicc -Wall -g -std=c99 -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi test.c -o mpi_exec.out

OpenACC compilation:

pgcc -acc -ta=tesla:managed -Minfo=accel -g -lm -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi test.c -o oacc_exec.out

To run the executable, you need to pass in the input .vti file and an output file name.

mpirun -np <1-8> <executable> input.vti outputName

Link to the input file: input.vti

I want this code to be very flexible: it should run on a single node with 1 GPU using [1-8] processes, and also on [1-8] nodes with [1-2] GPUs each. I am not using CUDA MPS.
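
For the multi-GPU-per-node case I would expect to pin one rank to one device with something like the sketch below. This is not part of the MCVE; OMPI_COMM_WORLD_LOCAL_RANK is an Open MPI specific environment variable (an assumption on my part, other MPIs expose an equivalent local-rank variable).

#include <stdlib.h>
#include <openacc.h>

// Sketch only: map each MPI rank on a node to one of the node's GPUs.
static void bind_rank_to_gpu(void)
{
    const char *lr = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
    int local_rank = lr ? atoi(lr) : 0;
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    if (ngpus > 0)
        acc_set_device_num(local_rank % ngpus, acc_device_nvidia);
}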

My specs:

GNU/Linux x86_64
NVIDIA GeForce GTX Titan CC: 3.5

pgcc 15.7-0 64-bit target on x86-64 Linux -tp sandybridge 
gcc (GCC) 4.8.1

Any help or suggestions on this would be greatly appreciated.

EDIT

** Compiling with OpenACC

$ pgcc -fast -ta=tesla:managed -Minfo=accel -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi rcrovella.c -o withacc
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 88)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 142)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 143)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 308)
fast_sweep:
    225, Generating copy(p[:1])
    228, Loop is parallelizable
    230, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
        228, #pragma acc loop gang /* blockIdx.y */
        230, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
solveEikonal:
    246, Generating acc routine seq
    262, Loop is parallelizable
    263, Loop carried dependence of m prevents parallelization
         Loop carried backward dependence of m prevents vectorization
         Loop carried dependence of d prevents parallelization
         Loop carried backward dependence of d prevents vectorization
PGC/x86-64 Linux 15.7-0: compilation completed with warnings

** Compiling without OpenACC

$ pgcc -I/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/include -L/cm/shared/apps/openmpi/gcc/64/1.8.5_wocuda/lib -lmpi rcrovella.c -o noacc
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 88)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 142)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 143)
PGC-W-0129-Floating point overflow. Check constants and constant expressions (rcrovella.c: 308)
PGC/x86-64 Linux 15.7-0: compilation completed with warnings

** Running with OpenACC
$ mpirun -n 1 withacc ../my_test/input.vti withacc1
PE: [0] - performing sweep number ..... [1/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [2/8]
PE: [0] - completed sweep number ...... [2/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [4/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [6/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [0] - completed sweep number ...... [7/8]
PE: [0] - performing sweep number ..... [8/8]
PE: [0] - completed sweep number ...... [8/8]
withacc1 file created

** Running without OpenACC
$ mpirun -n 1 noacc ../my_test/input.vti noacc1
PE: [0] - performing sweep number ..... [1/8]
PE: [0] - completed sweep number ...... [1/8]
PE: [0] - performing sweep number ..... [2/8]
PE: [0] - completed sweep number ...... [2/8]
PE: [0] - performing sweep number ..... [3/8]
PE: [0] - completed sweep number ...... [3/8]
PE: [0] - performing sweep number ..... [4/8]
PE: [0] - completed sweep number ...... [4/8]
PE: [0] - performing sweep number ..... [5/8]
PE: [0] - completed sweep number ...... [5/8]
PE: [0] - performing sweep number ..... [6/8]
PE: [0] - completed sweep number ...... [6/8]
PE: [0] - performing sweep number ..... [7/8]
PE: [0] - completed sweep number ...... [7/8]
PE: [0] - performing sweep number ..... [8/8]
PE: [0] - completed sweep number ...... [8/8]
noacc1 file created

** Compare
$ diff -q noacc1.txt withacc1.txt
Files noacc1.txt and withacc1.txt differ

1 Answer:

Answer 1 (score: 2):

  "Also, in this version I can't get it to work with OpenACC at all, but working through this should help me a lot."

Here is what I found:

  1. When using the managed memory facility:

    -ta=tesla:managed
    

    we don't normally include data directives or clauses in the code. The idea is to let the CUDA managed-memory runtime handle the data movement for us. So I commented out the two data directives that I consider "extraneous".
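
    As a minimal, hedged sketch (not taken from the code in this question) of what that difference looks like for a flat array of n doubles: with -ta=tesla:managed, dynamic allocations are placed in CUDA managed memory and no data directive is needed, whereas without managed memory the movement has to be spelled out explicitly.

    void zero_managed(double *dist, int n) {
      // -ta=tesla:managed: the runtime migrates the allocation on demand
      #pragma acc parallel loop
      for (int i = 0; i < n; i++) dist[i] = 0.0;
    }

    void zero_explicit(double *dist, int n) {
      // without managed memory: explicit host<->device copies via a data region
      #pragma acc data copy(dist[0:n])
      {
        #pragma acc parallel loop
        for (int i = 0; i < n; i++) dist[i] = 0.0;
      }
    }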

  2. I don't believe your parallel accelerator directives were correctly formed. My compiler (PGI 15.7) was complaining about the independent directives inside your parallel region:

    PGCC-S-0155-Illegal context(parallel) for independent  (t2.c: 228)
    

    Changing #pragma acc parallel to #pragma acc kernels is one possible workaround.
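
    As an alternative sketch of my own (not taken from the question's code): keeping the parallel construct but using the combined "parallel loop" form on the outer loop is also typically accepted, shown here on a made-up doubly nested loop rather than the fast_sweep loops.

    void scale_grid(double *a, int nx, int ny) {
      #pragma acc parallel loop independent copy(a[0:nx*ny])
      for (int x = 0; x < nx; x++) {
        #pragma acc loop independent
        for (int y = 0; y < ny; y++) {
          a[x * ny + y] *= 2.0;   // each element is updated independently
        }
      }
    }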

  3. Your code produces some compiler warnings around the use of INFINITY. Since these are only warnings, I have not bothered to address them.

  4. For some reason, I found that the compiler was not handling the SweepInfo struct (s) correctly on entry to the accelerator region. To work around that, I changed this:

    int x, y, z, i, j, k, index;
    #pragma acc parallel
    {
      #pragma acc loop independent
      for(x = xs; x <= xe; x++) {
        #pragma acc loop independent
        for(y = ys; y <= ye; y++) {
          z = level - (x+y);
          if(z > 0 && z <= NZ-2) {
            i = abs(z-s->zSweepOff);
            j = abs(y-s->ySweepOff);
            k = abs(x-s->xSweepOff);
    

    to this:

    int x, y, z, i, j, k, index;
    int xSO = s->xSweepOff;
    int ySO = s->ySweepOff;
    int zSO = s->zSweepOff;
    #pragma acc kernels
    {
      #pragma acc loop independent
      for(x = xs; x <= xe; x++) {
        #pragma acc loop independent
        for(y = ys; y <= ye; y++) {
          z = level - (x+y);
          if(z > 0 && z <= NZ-2) {
            i = abs(z-zSO);
            j = abs(y-ySO);
            k = abs(x-xSO);
    

    I may chew on this some more. I think there is either a limitation here that I don't understand, or a compiler bug.

  5. With the above changes, I was able to get your code to run to completion with no obvious issues. Here is my modified code:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include "mpi.h"
    
    #define min(a,b) (a > b) ? b : a
    #define max(a,b) (a < b) ? b : a
    
    #define NPES 8 // max number of PEs allowed
    #define DEFAULT_BORDER_LOCATION   -1
    #define DEFAULT_BORDER_DISTANCE   INFINITY
    #define DEFAULT_INTERIOR_DISTANCE 90000
    
    typedef struct {
      int order;
      int firstLevel, lastLevel, level;
      int xDim, yDim, zDim;
      int xSweepOff, ySweepOff, zSweepOff;
      double dx, dy, dz;
    } SweepInfo;
    
    typedef struct {
      double   dx, dy, dz;
      int * location;
      double * distance;
    } Phi;
    
    typedef struct {
      int x, y, z;
    } Grid3D;
    
    
    
    void calc_dist_field( Phi * p, int totalNodes );
    void write_to_file(double * dist);
    static SweepInfo make_sweepInfo( Phi * p, int my_rank );
    static void fast_sweep( Phi * p, SweepInfo * s );
    static double solveEikonal(Phi * p, int index, int max_x, int max_y);
    static void update_distance(Phi * p, int totalNodes);
    static void set_distance_negative_inside(Phi * p, int totalNodes);
    static void adjust_boundary( Phi * p );
    
    
    // public method declarations
    Grid3D make_grid3D(int x, int y, int z);
    void vti_get_dimensions(FILE *vti, double *d);
    void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g);
    
    // private method declarations
    static void move_file_pointer(FILE *file_ptr, int lineNumber, int r);
    static void get_location(FILE *vti, int *l, int b_l, Grid3D g);
    static void get_distance(FILE *vti, double *d, double b_d, Grid3D g);
    
    static int npes;          // Number of PEs
    static int my_rank;       // Rank of the PE
    static char * fileName;
    static char * outfileName;
    
    static int NX, NY, NZ, totalNodes;
    
    int main(int argc, char *argv[]) {
    
      // MPI startup routine
      MPI_Init(&argc, &argv);
      MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
      MPI_Comm_size(MPI_COMM_WORLD, &npes);
    
      fileName    = argv[1];
      outfileName = argv[2];
      FILE *f = fopen(fileName, "r");
    
      double dims[6];
      vti_get_dimensions(f, dims);
      NX = dims[0] + 3;
      NY = dims[1] + 3;
      NZ = dims[2] + 3;
    
      totalNodes = NX * NY * NZ;
      Phi *p = (Phi *) malloc(sizeof(Phi));
      p->location = (int *) malloc(sizeof(int) * totalNodes);
      p->distance = (double *) malloc(sizeof(double) * totalNodes);
    
      p->dx = dims[3]; p->dy = dims[4]; p->dz = dims[5];
    
      vti_get_data( f, p->location, DEFAULT_BORDER_LOCATION,
                       p->distance, DEFAULT_BORDER_DISTANCE,
                    make_grid3D(NX, NY, NZ));
    
    
      update_distance(p, totalNodes);
    
      calc_dist_field(p, totalNodes);
    
      MPI_Finalize();
      return 0;
    }
    
    void calc_dist_field( Phi * p, int totalNodes ) {
    
      int sweepNumber = my_rank + 1;
      double * tmp_dist;
    
      MPI_Barrier(MPI_COMM_WORLD);
      if(my_rank == 0){
        tmp_dist = (double *) malloc( totalNodes * sizeof(double) );
      }
    
      // sn represents the sweep number
      for( int sn = sweepNumber; sn <= NPES; sn += npes) {
        SweepInfo s = make_sweepInfo(p, sn);
    
        printf("PE: [%d] - performing sweep number ..... [%d/%d]\n", my_rank, sn, NPES);
    
        fast_sweep(p, &s);
    
        printf("PE: [%d] - completed sweep number ...... [%d/%d]\n", my_rank, sn, NPES);
    
      }
    
      MPI_Barrier(MPI_COMM_WORLD);
    //  #pragma acc update host(p->distance[0:totalNodes])
      MPI_Reduce(p->distance, tmp_dist, totalNodes, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    
      if( my_rank == 0 ) {
        free( p->distance );
        p->distance = tmp_dist;
        set_distance_negative_inside(p, totalNodes);
        adjust_boundary(p);
        write_to_file(p->distance);
        printf("%s file created\n", outfileName);
      }
    
    }
    
    static void update_distance(Phi * p, int totalNodes) {
    
      int    *l = &p->location[0];
      double *d = &p->distance[0];
    
      for(int i = 0; i < totalNodes; i++) {
        if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
          *d = (*l == 1 && *d == INFINITY) ? -1 : (*d > 0.0 || *d < 0.0) ? *d : DEFAULT_INTERIOR_DISTANCE;
        }
        l++; d++;
      }
    
    }
    
    
    
    void write_to_file(double * dist) {
    
      int x = NX;
      int y = NY;
      int z = NZ;
      char fname[255];
      sprintf(fname, "%s.txt", outfileName);
      FILE *fp = fopen(fname, "w");
    
      int i,j,k;
      double *t = &dist[0];
      for(i = 0; i < z; i++){
        for(j = 0; j < y; j++){
          for(k = 0; k < x; k++) {
            fprintf(fp, "%f  ", *(t++));
          }
          fprintf(fp, "\n");
        }
        fprintf(fp, "\n");
      }
    
    }
    
    
    static SweepInfo make_sweepInfo( Phi * p, int my_rank ) {
      SweepInfo s;
    
      s.order      = my_rank;
      s.firstLevel = 3;
      s.lastLevel  = (NX + NY + NZ) - 6;
    
      s.xDim = NX-2; s.dx = p->dx;
      s.yDim = NY-2; s.dy = p->dy;
      s.zDim = NZ-2; s.dz = p->dz;
    
      s.xSweepOff = (s.order == 4 || s.order == 8 ) ? s.xDim + 1 : 0;
      s.ySweepOff = (s.order == 2 || s.order == 6 ) ? s.yDim + 1 : 0;
      s.zSweepOff = (s.order == 3 || s.order == 7 ) ? s.zDim + 1 : 0;
    
      return s;
    }
    
    static void fast_sweep( Phi * p, SweepInfo * s ) {
    
      int start, end, incr;
    
      start = ( s->order == 2 || s->order == 5 || s->order == 7 || s->order == 8 ) ? s->lastLevel : s->firstLevel;
    
      if ( start == s->firstLevel ) {
        end  = s->lastLevel + 1;
        incr = 1;
      }
      else {
        end  = s->firstLevel - 1;
        incr = 0;
      }
    
      int max_x  = s->xDim + 2;
      int max_y  = s->yDim + 2;
      int max_xy = max_x * max_y;
    
      //#pragma acc data create(p[0:1]) copy(p->distance[0:totalNodes])
      for(int level = start; level != end; level = (incr) ? level+1 : level-1) {
        // s - start, e - end
        int xs, xe, ys, ye;
    
        xs = max(1, level-(s->yDim + s->zDim))    , ys = max(1,level-(s->xDim + s->zDim));
        xe = min(s->xDim, level-(s->firstLevel-1)), ye = min(s->yDim, level-(s->firstLevel-1));
    
        int x, y, z, i, j, k, index;
        int xSO = s->xSweepOff;
        int ySO = s->ySweepOff;
        int zSO = s->zSweepOff;
        #pragma acc kernels
        {
          #pragma acc loop independent
          for(x = xs; x <= xe; x++) {
            #pragma acc loop independent
            for(y = ys; y <= ye; y++) {
              z = level - (x+y);
              if(z > 0 && z <= NZ-2) {
                i = abs(z-zSO);
                j = abs(y-ySO);
                k = abs(x-xSO);
                index = i * max_xy + j * max_x + k;
                p->distance[index] = solveEikonal(p, index, NX, NY);
              }
            }
          } // end of acc parallel
        }
      }
    }
    
    #pragma acc routine seq
    static double solveEikonal(Phi * p, int index, int max_x, int max_y) {
    
      int max_xy = max_x * max_y;
    
      double dist_new = 0;
      double dist_old = p->distance[index];
    
      double dx = p->dx, dy = p->dy, dz = p->dz;
      double minX = min(p->distance[index-1], p->distance[index+1]);
      double minY = min(p->distance[abs(index-max_x)], p->distance[abs(index+max_x)]);
      double minZ = min(p->distance[abs(index-max_xy)],p->distance[abs(index+max_xy)]);
    
      double m[] = { minX, minY, minZ} ;
      double d[] = { dx, dy, dz};
    
      // sort the mins 
      for(int i = 1; i < 3; i++){
        for(int j = 0; j < 3-i; j++) {
          if(m[j] > m[j+1]) {
            double tmp_m = m[j];
            double tmp_d = d[j];
            m[j] = m[j+1]; d[j] = d[j+1];
            m[j+1] = tmp_m; d[j+1] = tmp_d;
          }
        }
      }
    
      // simplifying the variables
      double m_0 = m[0], m_1 = m[1], m_2 = m[2];
      double d_0 = d[0], d_1 = d[1], d_2 = d[2]; 
      double m2_0 = m_0 * m_0, m2_1 = m_1 * m_1, m2_2 = m_2 * m_2;
      double d2_0 = d_0 * d_0, d2_1 = d_1 * d_1, d2_2 = d_2 * d_2;
    
      dist_new = m_0 + d_0;
      if(dist_new > m_1) {
    
        double s = sqrt(- m2_0 + 2 * m_0 * m_1 - m2_1 + d2_0 + d2_1); 
        dist_new = ( m_1 * d2_0 + m_0 * d2_1 + d_0 * d_1 * s) / (d2_0 + d2_1);
    
        if(dist_new > m_2) {
    
          double a = sqrt(- m2_0 * d2_1 - m2_0 * d2_2 + 2 * m_0 * m_1 * d2_2
                          - m2_1 * d2_0 - m2_1 * d2_2 + 2 * m_0 * m_2 * d2_1
                          - m2_2 * d2_0 - m2_2 * d2_1 + 2 * m_1 * m_2 * d2_0
                          + d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);
    
          dist_new = (m_2 * d2_0 * d2_1 + m_1 * d2_0 * d2_2 + m_0 * d2_1 * d2_2 + d_0 * d_1 * d_2 * a) /
                      (d2_0 * d2_1 + d2_0 * d2_2 + d2_1 * d2_2);
        }
      }
    
      return min(dist_old, dist_new);
    }
    
    
    
    static void set_distance_negative_inside(Phi * p, int totalNodes) {
    
      int    *l = &p->location[0];
      double *d = &p->distance[0];
    
      for(int i = 0; i < totalNodes; i++) {
        if(*l != DEFAULT_BORDER_LOCATION && *d != DEFAULT_BORDER_DISTANCE ) {
          if( *l == 1) *d = -1;
        }
        l++; d++;
      }
    
    }
    
    static void adjust_boundary( Phi * p ) {
    
      int x, y, z, xy, i, j, k;
      x  = NX;
      y  = NY;
      z  = NZ;
      xy = x * y;
    
      for(i = 0; i < z; i++){
        for(j = 0; j < y; j++){
          for(k = 0; k < x; k++){
            int I = i, J = j, K = k;
            I = (i == z-1) ? I-1 : (!i) ? I+1 : I;
            J = (j == y-1) ? J-1 : (!j) ? J+1 : J;
            K = (k == x-1) ? K-1 : (!k) ? K+1 : K;
            if( i != I || j != J || k != K) {
                int l_index = i * xy + j * x + k;
                int r_index = I * xy + J * x + K;
                p->distance[l_index] = p->distance[r_index];
            }
          }
        }
      }
    }
    
    
    
    /**************** vti_parser ********************************/
    
    static void move_file_pointer(FILE *file_ptr, int lineNumber, int r) {
        char tmpStr[512];
        if(r) rewind(file_ptr);
        while (lineNumber > 0){
            fgets (tmpStr, 511, file_ptr);
            lineNumber--;
        }
    }
    
    void vti_get_dimensions(FILE *vti, double *d) {
        char tmpStr[512];
        rewind(vti);
        while (1) {
            fgets (tmpStr, 511, vti);
            if ( strstr(tmpStr, "ImageData WholeExtent") ) {
                sscanf(tmpStr, "    <ImageData WholeExtent=\"0 %lf 0 %lf 0 %lf\" Spacing=\"%lf %lf %lf\">",
                        &d[0], &d[1], &d[2], &d[3], &d[4], &d[5]);
                break;
            }
        }
    }
    
    
    void vti_get_data(FILE *vti, int *l, int b_l, double *d, double b_d, Grid3D g) {
    
        // move the file pointer to
        // line 6 from beginning
        move_file_pointer(vti, 6, 1);
    
        get_location(vti, l, b_l, g);
    
        // move the file pointer 2 lines
        // forward from its last position
        move_file_pointer(vti, 2, 0);
    
        get_distance(vti, d, b_d, g);
    
    }
    
    static void get_location(FILE *vti, int *l, int b_l, Grid3D g) {
        int i, j, k, *t = &l[0];
        for (i = 0; i < g.z; i++){
            for (j = 0; j < g.y; j++) {
                for (k = 0; k < g.x; k++) {
                    // Border
                    if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
                        *(t++) = b_l;
                    }
                    else{ // Interior
                        fscanf(vti, "%d ", t++);
                    }
                }
            }
        }
    }
    
    
    
    static void get_distance(FILE *vti, double *d, double b_d, Grid3D g) {
        int i, j, k;
        double *t = &d[0];
        for (i = 0; i < g.z; i++){
            for (j = 0; j < g.y; j++) {
                for (k = 0; k < g.x; k++) {
                    // Border distance
                    if (k == 0 || k == g.x-1 || j == 0 || j == g.y-1 || i == 0 || i == g.z-1 ) {
                        *(t++) = b_d;
                    }
                    else{ // Interior distance
                        fscanf(vti, "%lf ", t++);
                    }
                }
            }
        }
    }
    
    Grid3D make_grid3D(int x, int y, int z){
        Grid3D g;
        g.x = x; g.y = y; g.z = z;
    
        return g;
    }
    

    Here is my compile command:

    pgc++ -fast -acc -ta=tesla:managed -Minfo=accel -I/opt/pgi/linux86-64/15.7/mpi/mpich/include -L/opt/pgi/linux86-64/15.7/mpi/mpich/lib -lmpi t2.c -o t2
    

    Here is the output:

    $ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib ./t2 input.vti output
    PE: [0] - performing sweep number ..... [1/8]
    PE: [0] - completed sweep number ...... [1/8]
    PE: [0] - performing sweep number ..... [2/8]
    PE: [0] - completed sweep number ...... [2/8]
    PE: [0] - performing sweep number ..... [3/8]
    PE: [0] - completed sweep number ...... [3/8]
    PE: [0] - performing sweep number ..... [4/8]
    PE: [0] - completed sweep number ...... [4/8]
    PE: [0] - performing sweep number ..... [5/8]
    PE: [0] - completed sweep number ...... [5/8]
    PE: [0] - performing sweep number ..... [6/8]
    PE: [0] - completed sweep number ...... [6/8]
    PE: [0] - performing sweep number ..... [7/8]
    PE: [0] - completed sweep number ...... [7/8]
    PE: [0] - performing sweep number ..... [8/8]
    PE: [0] - completed sweep number ...... [8/8]
    output file created
    $
    

    In my case it appeared to run faster than the non-OpenACC version (compiled without -acc -ta=tesla:managed -Minfo=accel), and I diff'd the output files: the OpenACC and non-OpenACC versions produced identical output.

    I also tried running this code with 2 MPI ranks. It appears to run without crashing:

    $ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpirun -n 2 ./t2 input.vti output
    PE: [1] - performing sweep number ..... [2/8]
    PE: [0] - performing sweep number ..... [1/8]
    PE: [0] - completed sweep number ...... [1/8]
    PE: [0] - performing sweep number ..... [3/8]
    PE: [1] - completed sweep number ...... [2/8]
    PE: [1] - performing sweep number ..... [4/8]
    PE: [0] - completed sweep number ...... [3/8]
    PE: [0] - performing sweep number ..... [5/8]
    PE: [1] - completed sweep number ...... [4/8]
    PE: [1] - performing sweep number ..... [6/8]
    PE: [0] - completed sweep number ...... [5/8]
    PE: [0] - performing sweep number ..... [7/8]
    PE: [1] - completed sweep number ...... [6/8]
    PE: [1] - performing sweep number ..... [8/8]
    PE: [0] - completed sweep number ...... [7/8]
    PE: [1] - completed sweep number ...... [8/8]
    output file created
    $
    

    The output data file differs from the one produced by the single-rank run, but it matches the one produced by a two-rank non-OpenACC run. So if there are any remaining issues, I think they are MPI-related rather than OpenACC-related.

    EDIT

    To illustrate that last point, let's take OpenACC out of the picture entirely and just use the MPI (MPICH) that ships with the PGI 15.7 toolchain:

    $ /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpicc t2.c -o t2 -lmpi
    PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 88)
    PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 142)
    PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 143)
    PGC-W-0129-Floating point overflow. Check constants and constant expressions (t2.c: 308)
    PGC/x86-64 Linux 15.7-0: compilation completed with warnings
    $ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpirun -n 1 ./t2 input.vti output1rank
    PE: [0] - performing sweep number ..... [1/8]
    PE: [0] - completed sweep number ...... [1/8]
    PE: [0] - performing sweep number ..... [2/8]
    PE: [0] - completed sweep number ...... [2/8]
    PE: [0] - performing sweep number ..... [3/8]
    PE: [0] - completed sweep number ...... [3/8]
    PE: [0] - performing sweep number ..... [4/8]
    PE: [0] - completed sweep number ...... [4/8]
    PE: [0] - performing sweep number ..... [5/8]
    PE: [0] - completed sweep number ...... [5/8]
    PE: [0] - performing sweep number ..... [6/8]
    PE: [0] - completed sweep number ...... [6/8]
    PE: [0] - performing sweep number ..... [7/8]
    PE: [0] - completed sweep number ...... [7/8]
    PE: [0] - performing sweep number ..... [8/8]
    PE: [0] - completed sweep number ...... [8/8]
    output1rank file created
    $ LD_LIBRARY_PATH=/opt/pgi/linux86-64/15.7/mpi/mpich/lib /opt/pgi/linux86-64/15.7/mpi/mpich/bin/mpirun -n 2 ./t2 input.vti output2rank
    PE: [0] - performing sweep number ..... [1/8]
    PE: [1] - performing sweep number ..... [2/8]
    PE: [1] - completed sweep number ...... [2/8]
    PE: [1] - performing sweep number ..... [4/8]
    PE: [0] - completed sweep number ...... [1/8]
    PE: [0] - performing sweep number ..... [3/8]
    PE: [1] - completed sweep number ...... [4/8]
    PE: [1] - performing sweep number ..... [6/8]
    PE: [0] - completed sweep number ...... [3/8]
    PE: [0] - performing sweep number ..... [5/8]
    PE: [1] - completed sweep number ...... [6/8]
    PE: [1] - performing sweep number ..... [8/8]
    PE: [1] - completed sweep number ...... [8/8]
    PE: [0] - completed sweep number ...... [5/8]
    PE: [0] - performing sweep number ..... [7/8]
    PE: [0] - completed sweep number ...... [7/8]
    output2rank file created
    $ diff -q output1rank.txt output2rank.txt
    Files output1rank.txt and output2rank.txt differ
    $