CUDA事件添加矩阵时出错

时间:2016-03-04 02:12:08

标签: cuda

我遇到了一个我不知道如何解决它的问题,如果可能的话,你能帮助我吗?

我想测量把两个一维数组相加的内核的执行时间,所以我尝试在我的 GF 820M 上实现并执行该内核。加法内核的结果(我把它们写到了一个文件中)是正确的,但是测得的内核执行时间非常小,我怀疑这些时间值是否正确。

我认为问题在于指令执行时间测量,但我遵循CUDA编程指南中存在的相同语法。

执行在GF 820 M上完成:

对于WIDTH = 20480,线程数 = 512;时间 = 0.01812毫秒

并且对于WIDTH = 20480,线程数 = 1024;时间 = 0.021920毫秒

提前感谢您提供的任何帮助。

代码:

  // Element-wise addition of two 1-D arrays: z[i] = x[i] + y[i].
  // Expects a 1-D grid of 1-D blocks; `size` is the element count.
  __global__ void kernel_sum(const float *x, const float *y, float *z, int size) {
      int gtid = threadIdx.x + blockIdx.x * blockDim.x;
      // Bounds guard: without it, any grid that overruns `size`
      // (e.g. when size is not a multiple of blockDim.x) writes out of bounds.
      if (gtid < size)
          z[gtid] = x[gtid] + y[gtid];
  }

 int main ()

{           
          cudaError_t cudaStatus;
          float *array1_d , *array2_d ,*M_result_array_d ; // device array
          const int WIDTH=20480 ;

          float  *array1_h = (float *)malloc(WIDTH*sizeof(float));
          float  *array2_h= (float*)malloc(WIDTH*sizeof(float));
          float  *M_result_array_h = (float *)malloc(WIDTH*sizeof(float));

        //remplissage des deux matrices d'entree

       for ( int i = 0 ; i<WIDTH; i++ ) 
                             {  array1_h[i]=i; 
                               array2_h[i]=i; }                         

      // Allocate GPU buffers for 2 matrices (two input, one output) 
        cudaStatus = cudaMalloc((void **) &array1_d , WIDTH*sizeof (float));
   if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); }  

        cudaStatus = cudaMalloc((void **) &array2_d , WIDTH*sizeof (float));
   if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); }  

     //allocating memory for resultent device array
    cudaStatus = cudaMalloc((void **) &M_result_array_d , WIDTH*sizeof   (float) ) ;
    if (cudaStatus != cudaSuccess) {fprintf(stderr, "cudaMalloc failed!"); } 

   //copy host array to device array;
cudaStatus = cudaMemcpy ( array1_d , array1_h , WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;

 if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy 0 failed!"); }

    cudaStatus = cudaMemcpy ( array2_d , array2_h,WIDTH*sizeof(float),cudaMemcpyHostToDevice ) ;
if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy 1 failed!"); }

           //lancer kernel
              const int t=512;
              int NUMBER_OF_BLOCKS = WIDTH/t;
              cudaEvent_t start, stop;
               float time;
               cudaEventCreate(&start);
               cudaEventCreate(&stop);
              cudaEventRecord(start, 0);
      kernel_sum <<<NUMBER_OF_BLOCKS,t>>> ( array1_d , array2_d ,M_result_array_d ,WIDTH) ;
              cudaEventRecord(stop, 0);
              cudaEventSynchronize(stop);
              cudaEventElapsedTime(&time, start, stop);
              cudaEventDestroy(start);
              cudaEventDestroy(stop);

cudaStatus = cudaGetLastError();if (cudaStatus != cudaSuccess) {fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); }

//afffichage de temps du kernel et enregister dans un fichier 
        printf ("N=%5d  NT=%5d Time=%5f \n",WIDTH,t,time);

//copy back result_array_d to result_array_h

 cudaStatus = cudaMemcpy(M_result_array_h , M_result_array_d , WIDTH*sizeof(float) ,cudaMemcpyDeviceToHost) ;

    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy result from GPU to host failed!"); } 



    /*//print result

  printf("A \n");
  for ( int i = 0 ; i<WIDTH ; i++ )  
         { printf("%f  ", array1_h[i] );printf ("\n") ;
          } printf ("\n") ;
             printf ("\n");

 printf("B \n"); for ( int i = 0 ; i<WIDTH; i++ )  
          {  printf("%f  ", array2_h[i] );printf ("\n") ;
          } printf ("\n") ;
            printf ("\n");


  printf("Resultat \n");
  for (int i = 0 ; i<WIDTH ; i++ )
     {   printf ("%f   ",M_result_array_h[i] ) ;printf ("\n") ; }
      printf ("\n") ;  
       printf ("\n");

       */

    FILE* fichier = NULL;
    fichier = fopen("resultat.text", "a+");
            if (fichier != NULL)
                {   for (int i = 0 ; i<WIDTH ; i++ )
                    fprintf(fichier, "%5f \n ",M_result_array_h[i]);
                      fprintf(fichier, "\n ");      
                         fclose(fichier);
                                              } 
           else  {printf("Impossible d'ouvrir le fichier"); }


    cudaFree(array1_d);
    cudaFree(array2_d);
    cudaFree(M_result_array_d);





  system("pause") ; 

}

1 个答案:

答案 0 :(得分:1)

我认为这些时间是合理的。但如果您真的想了解内核的性能,那么使用 nvprof 或 nvvp 比使用 cudaEventElapsedTime 得到的计时更准确。

这是在 GeForce Titan X 上使用 nvprof 执行得到的结果:
==20602== NVPROF is profiling process 20602, command: ./test
N=20480  NT=  512 Time=0.026720 
==20602== Profiling application: ./test
==20602== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
63.50%  19.648us         2  9.8240us  9.7920us  9.8560us  [CUDA memcpy HtoD]
28.75%  8.8960us         1  8.8960us  8.8960us  8.8960us  [CUDA memcpy DtoH]
 7.76%  2.4000us         1  2.4000us  2.4000us  2.4000us  kernel_sum(float*, float*, float*, int)

==20602== API calls:
Time(%)      Time     Calls       Avg       Min       Max  Name
97.28%  156.96ms         3  52.321ms  3.1070us  156.95ms  cudaMalloc
 2.15%  3.4675ms       332  10.444us     180ns  437.61us  cuDeviceGetAttribute
 0.20%  317.27us         4  79.318us  79.038us  79.945us  cuDeviceTotalMem
 0.17%  270.27us         4  67.567us  64.960us  74.955us  cuDeviceGetName
 0.11%  171.56us         3  57.185us  4.4910us  142.25us  cudaFree
 0.07%  117.87us         3  39.288us  29.311us  57.825us  cudaMemcpy
 0.01%  20.980us         1  20.980us  20.980us  20.980us  cudaLaunch
 0.00%  5.6630us         1  5.6630us  5.6630us  5.6630us  cudaEventSynchronize
 0.00%  5.1940us         2  2.5970us  2.2760us  2.9180us  cudaEventRecord
 0.00%  3.1650us         2  1.5820us     542ns  2.6230us  cudaEventCreate
 0.00%  3.0440us         4     761ns     167ns  2.3220us  cudaSetupArgument
 0.00%  2.6610us         8     332ns     204ns     619ns  cuDeviceGet
 0.00%  1.8210us         1  1.8210us  1.8210us  1.8210us  cudaEventElapsedTime
 0.00%  1.7800us         2     890ns     510ns  1.2700us  cudaEventDestroy
 0.00%  1.3870us         2     693ns     338ns  1.0490us  cuDeviceGetCount
 0.00%  1.2160us         1  1.2160us  1.2160us  1.2160us  cudaConfigureCall
 0.00%     339ns         1     339ns     339ns     339ns  cudaGetLastError