使用OpenACC时的零输出

时间:2018-03-18 06:53:22

标签: c++ gpu openacc pgi

我使用 PGI 社区版 17.10 编译并运行下面的代码。加入 OpenACC 指令后,输出结果不正确,原因是什么?如果有人能帮我解释为什么会这样,我将不胜感激。提前致谢,sajad

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>

#define NX 2
#define NY 2
#define NZ 2
/*
 * Question's reproducer: fills C, D and E inside an OpenACC kernels region
 * and then writes every (C[i], D[j], E[k]) combination to BB-and-A.csv.
 * The code is kept exactly as posted; the NOTE(review) comments only point
 * out what the visible lines do.
 */
int main(void)
{
/* Loop indices have static storage duration here; they are also listed in
 * both the data copy clause and the private clause below. */
static int  i, j, k;
/* Each array has exactly NX / NY / NZ (= 2) elements per dimension.
 * NOTE(review): the initializers are bare scalars rather than brace-enclosed
 * lists; ISO C requires braces for arrays, so this relies on compiler
 * leniency. A and B are never read or written anywhere in this function. */
static double A[NX][NY][NZ]=2 ,B[NX][NY][NZ]=10.,C[NX]=10.,D[NY]=10.,E[NZ]=10.;

FILE *file;
/* NOTE(review): the result of fopen is not checked before it is used. */
file = fopen("BB-and-A.csv", "w");
#pragma acc  data copy( A ,B,C,D,E,i, j, k)
      {
/* NOTE(review): the bounds are inclusive (<= NX, <= NY, <= NZ), so each loop
 * runs one iteration more than the matching array has elements; C[NX], D[NY]
 * and E[NZ] are written past the end of their arrays.
 * Every C[i] is stored once per (j, k) pair, every D[j] once per (i, k) pair
 * and every E[k] once per (i, j) pair, i.e. the same element is written from
 * many different iterations of the nest. */
#pragma acc   kernels loop private(i, j, k)

            for (i = 0; i <= NX; i++) {
                for (j =0; j <= NY ; j++) {
                    for (k =0; k <= NZ ; k++) {
                            C[i]=i;
                            D[j]=j;
                            E[k]=k;
                    }
                }
            }
}
    /* Host-side dump: one CSV row per (i, j, k) triple, using the same
     * inclusive bounds, so C[NX], D[NY] and E[NZ] are read out of bounds too. */
    for (i = 0; i <= NX; i++) {
                for (j =0; j <= NY ; j++) {
                    for (k =0; k <= NZ ; k++) {
                        fprintf(file, "%e, %e, %e \n", C[i], D[j],E[k] );
                    }
                }
    }
fclose(file);
}

1 个答案:

答案 0 :(得分:1)

此代码存在许多问题。

1)您的数组边界不正确。循环从 0 一直运行到 <= N(共 N+1 次迭代),但数组只有 N 个元素,因此您会越过数组末尾进行写入。

2)您的循环无法并行化,因为每个数组元素都会被多个循环迭代写入。为了解决这个问题,我把它拆成了三个相互独立的循环。

3)循环索引变量不应声明为 static。static 会把它们放到全局存储中,从而产生依赖关系。虽然可以通过把它们放进 private 子句来解决这个问题,但最好还是去掉 static,让编译器隐式地将它们私有化。

4)无需复制循环索引变量。

尝试以下内容:

% cat test2.c 
 #include <stdio.h> 
  #include <math.h> 
  #include <stdlib.h> 
  #include <assert.h> 
  #include <openacc.h> 
  #include<time.h> 
  #include <string.h> 
  #include <malloc.h> 
 // #include <cuda_runtime_api.h> 

  #define NX 2 
  #define NY 2 
  #define NZ 2 


  /* Fill C, D and E on the accelerator, then write every (C[i], D[j], E[k]) triple to BB-and-A.csv. */
  int main(void) {
  int i, j, k; /* plain automatics (not static), so the compiler can privatize them per iteration */
  /* ISO C requires brace-enclosed array initializers; A and B are unused but kept to mirror the question. */
  static double A[NX+1][NY+1][NZ+1]={{{2}}}, B[NX+1][NY+1][NZ+1]={{{10.}}}, C[NX+1]={10.}, D[NY+1]={10.}, E[NZ+1]={10.};
  FILE *file = fopen("BB-and-A.csv", "w");
  if (file == NULL) { perror("BB-and-A.csv"); return EXIT_FAILURE; } /* never write through a NULL stream */
  #pragma acc data copy(A,B,C,D,E)
  {
  #pragma acc kernels
  {
  for (i = 0; i <= NX; i++) C[i]=i; /* arrays hold N+1 elements, so the inclusive bound is in range */
  for (j =0; j <= NY ; j++) D[j]=j; /* one loop per array: each element is written by exactly one iteration */
  for (k =0; k <= NZ ; k++) E[k]=k;
  } }
  for (i = 0; i <= NX; i++) { /* host side: one CSV row per (i, j, k) combination */
  for (j =0; j <= NY ; j++) {
  for (k =0; k <= NZ ; k++) {
    fprintf(file,"%e, %e, %e \n", C[i], D[j],E[k] );
  } } }
  if (fclose(file) != 0) { perror("BB-and-A.csv"); return EXIT_FAILURE; } /* buffered write errors surface at close */
  return 0;
  }
 % pgcc test2.c -ta=tesla:cc60 -Minfo=accel 
 main: 
      23, Generating copy(A[:][:][:],B[:][:][:],C[:],E[:],D[:]) 
      27, Loop is parallelizable 
          Accelerator kernel generated 
          Generating Tesla code 
          27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 
      28, Loop is parallelizable 
          Accelerator kernel generated 
          Generating Tesla code 
          28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 
      29, Loop is parallelizable 
          Accelerator kernel generated 
          Generating Tesla code 
          29, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 
 % a.out 
 % cat BB-and-A.csv 
 0.000000e+00, 0.000000e+00, 0.000000e+00 
 0.000000e+00, 0.000000e+00, 1.000000e+00 
 0.000000e+00, 0.000000e+00, 2.000000e+00 
 0.000000e+00, 1.000000e+00, 0.000000e+00 
 0.000000e+00, 1.000000e+00, 1.000000e+00 
 0.000000e+00, 1.000000e+00, 2.000000e+00 
 0.000000e+00, 2.000000e+00, 0.000000e+00 
 0.000000e+00, 2.000000e+00, 1.000000e+00 
 0.000000e+00, 2.000000e+00, 2.000000e+00 
 1.000000e+00, 0.000000e+00, 0.000000e+00 
 1.000000e+00, 0.000000e+00, 1.000000e+00 
 1.000000e+00, 0.000000e+00, 2.000000e+00 
 1.000000e+00, 1.000000e+00, 0.000000e+00 
 1.000000e+00, 1.000000e+00, 1.000000e+00 
 1.000000e+00, 1.000000e+00, 2.000000e+00 
 1.000000e+00, 2.000000e+00, 0.000000e+00 
 1.000000e+00, 2.000000e+00, 1.000000e+00 
 1.000000e+00, 2.000000e+00, 2.000000e+00 
 2.000000e+00, 0.000000e+00, 0.000000e+00 
 2.000000e+00, 0.000000e+00, 1.000000e+00 
 2.000000e+00, 0.000000e+00, 2.000000e+00 
 2.000000e+00, 1.000000e+00, 0.000000e+00 
 2.000000e+00, 1.000000e+00, 1.000000e+00 
 2.000000e+00, 1.000000e+00, 2.000000e+00 
 2.000000e+00, 2.000000e+00, 0.000000e+00 
 2.000000e+00, 2.000000e+00, 1.000000e+00 
 2.000000e+00, 2.000000e+00, 2.000000e+00