I've run into a problem I don't know how to solve; could you help me if possible?
I want to measure the execution time of summing two 1D arrays, so I implemented and ran a kernel on my GeForce 820M. The results of the kernel's addition (which I write to a file) are correct, but the execution times reported for this kernel are very low, and I question whether these values are right.
I think the problem lies in how the execution time is measured, but I followed the same syntax given in the CUDA Programming Guide.
Runs on the GeForce 820M:
for WIDTH = 20480 and 512 threads per block: time = 0.01812 ms
and for WIDTH = 20480 and 1024 threads per block: time = 0.021920 ms
Thanks in advance for any help.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void kernel_sum(float *x, float *y, float *z, int size) {
    int gtid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gtid < size)                 // guard against out-of-range threads
        z[gtid] = x[gtid] + y[gtid];
}
int main()
{
    cudaError_t cudaStatus;
    float *array1_d, *array2_d, *M_result_array_d;   // device arrays
    const int WIDTH = 20480;
    float *array1_h = (float *)malloc(WIDTH * sizeof(float));
    float *array2_h = (float *)malloc(WIDTH * sizeof(float));
    float *M_result_array_h = (float *)malloc(WIDTH * sizeof(float));

    // fill the two input arrays
    for (int i = 0; i < WIDTH; i++) {
        array1_h[i] = i;
        array2_h[i] = i;
    }
    // allocate GPU buffers for the three arrays (two input, one output)
    cudaStatus = cudaMalloc((void **)&array1_d, WIDTH * sizeof(float));
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); }
    cudaStatus = cudaMalloc((void **)&array2_d, WIDTH * sizeof(float));
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); }
    cudaStatus = cudaMalloc((void **)&M_result_array_d, WIDTH * sizeof(float));
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); }

    // copy the host arrays to the device
    cudaStatus = cudaMemcpy(array1_d, array1_h, WIDTH * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy 0 failed!"); }
    cudaStatus = cudaMemcpy(array2_d, array2_h, WIDTH * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy 1 failed!"); }
    // launch the kernel, timed with CUDA events
    const int t = 512;
    int NUMBER_OF_BLOCKS = WIDTH / t;
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    kernel_sum<<<NUMBER_OF_BLOCKS, t>>>(array1_d, array2_d, M_result_array_d, WIDTH);
    cudaStatus = cudaGetLastError();   // check the launch itself
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // print the kernel time (the results are written to a file below)
    printf("N=%5d NT=%5d Time=%5f \n", WIDTH, t, time);
    // copy M_result_array_d back to M_result_array_h
    cudaStatus = cudaMemcpy(M_result_array_h, M_result_array_d, WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy result from GPU to host failed!"); }
    /* // print the inputs and the result
    printf("A \n");
    for (int i = 0; i < WIDTH; i++)
        printf("%f \n", array1_h[i]);
    printf("\n\n");
    printf("B \n");
    for (int i = 0; i < WIDTH; i++)
        printf("%f \n", array2_h[i]);
    printf("\n\n");
    printf("Result \n");
    for (int i = 0; i < WIDTH; i++)
        printf("%f \n", M_result_array_h[i]);
    printf("\n\n");
    */
    // append the result to a file
    FILE *fichier = fopen("resultat.text", "a+");
    if (fichier != NULL) {
        for (int i = 0; i < WIDTH; i++)
            fprintf(fichier, "%5f \n ", M_result_array_h[i]);
        fprintf(fichier, "\n ");
        fclose(fichier);
    }
    else { printf("Unable to open the file"); }

    cudaFree(array1_d);
    cudaFree(array2_d);
    cudaFree(M_result_array_d);
    free(array1_h);
    free(array2_h);
    free(M_result_array_h);

    system("pause");   // Windows-only: keep the console window open
    return 0;
}
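One way to sanity-check such small numbers is to average over many launches, since a single launch is easily dominated by launch overhead and timer resolution. A minimal sketch of that idea (not part of the original program; NITER is an illustrative choice, and the events must still be alive at this point):

    const int NITER = 1000;
    cudaEventRecord(start, 0);
    for (int it = 0; it < NITER; it++)
        kernel_sum<<<NUMBER_OF_BLOCKS, t>>>(array1_d, array2_d, M_result_array_d, WIDTH);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Average kernel time = %f ms\n", time / NITER);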
Answer 0 (score: 1):
I think these times make sense. But if you really want to know your kernel's performance, profiling with nvprof and nvvp gives better numbers than the time from cudaEventElapsedTime.
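For reference, nvprof is run directly against the compiled binary (assumed here to be named ./test, matching the session below):

    nvprof ./test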
Here is the run on a GeForce Titan X:
==20602== NVPROF is profiling process 20602, command: ./test
N=20480 NT= 512 Time=0.026720
==20602== Profiling application: ./test
==20602== Profiling result:
Time(%) Time Calls Avg Min Max Name
63.50% 19.648us 2 9.8240us 9.7920us 9.8560us [CUDA memcpy HtoD]
28.75% 8.8960us 1 8.8960us 8.8960us 8.8960us [CUDA memcpy DtoH]
7.76% 2.4000us 1 2.4000us 2.4000us 2.4000us kernel_sum(float*, float*, float*, int)
==20602== API calls:
Time(%) Time Calls Avg Min Max Name
97.28% 156.96ms 3 52.321ms 3.1070us 156.95ms cudaMalloc
2.15% 3.4675ms 332 10.444us 180ns 437.61us cuDeviceGetAttribute
0.20% 317.27us 4 79.318us 79.038us 79.945us cuDeviceTotalMem
0.17% 270.27us 4 67.567us 64.960us 74.955us cuDeviceGetName
0.11% 171.56us 3 57.185us 4.4910us 142.25us cudaFree
0.07% 117.87us 3 39.288us 29.311us 57.825us cudaMemcpy
0.01% 20.980us 1 20.980us 20.980us 20.980us cudaLaunch
0.00% 5.6630us 1 5.6630us 5.6630us 5.6630us cudaEventSynchronize
0.00% 5.1940us 2 2.5970us 2.2760us 2.9180us cudaEventRecord
0.00% 3.1650us 2 1.5820us 542ns 2.6230us cudaEventCreate
0.00% 3.0440us 4 761ns 167ns 2.3220us cudaSetupArgument
0.00% 2.6610us 8 332ns 204ns 619ns cuDeviceGet
0.00% 1.8210us 1 1.8210us 1.8210us 1.8210us cudaEventElapsedTime
0.00% 1.7800us 2 890ns 510ns 1.2700us cudaEventDestroy
0.00% 1.3870us 2 693ns 338ns 1.0490us cuDeviceGetCount
0.00% 1.2160us 1 1.2160us 1.2160us 1.2160us cudaConfigureCall
0.00% 339ns 1 339ns 339ns 339ns cudaGetLastError
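A rough bandwidth check supports this: the vector add is memory-bound, moving 3 * WIDTH * sizeof(float) bytes per launch (two reads and one write per element). The sketch below works through the arithmetic; the kernel times are taken from the runs above, while the peak-bandwidth figures (roughly 336 GB/s for the Titan X and on the order of 14 GB/s for the DDR3-based 820M) are spec-sheet assumptions, not measurements:

    #include <stdio.h>

    int main(void) {
        // Two loads and one store per element in kernel_sum.
        const double bytes   = 3.0 * 20480 * sizeof(float); // 245,760 bytes
        const double t_titan = 2.4e-6;   // kernel time from nvprof above, in seconds
        const double t_820m  = 18.12e-6; // kernel time from the question, in seconds
        printf("Titan X effective bandwidth: %.1f GB/s\n", bytes / t_titan / 1e9); // ~102 GB/s
        printf("820M    effective bandwidth: %.1f GB/s\n", bytes / t_820m / 1e9);  // ~13.6 GB/s
        return 0;
    }

The 820M figure lands near that card's likely memory peak, which is consistent with the small measured times being real. Note also that the event-based 0.026720 ms above is an order of magnitude larger than the 2.4 us the profiler attributes to the kernel itself, because the event interval also captures launch overhead.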