Question

我想测量一个需要运行多次的内核的总执行时间，每次内核调用所处理的数据都不同。我的代码如下，cudaMemcpy 的时间不应计入。

// Timing sketch from the question: adds up the per-launch time of matrixMulCUDA1 over nIter launches.
// NOTE(review): the leading numbers on each line appear to be line labels pasted along with the code.
1 cudaEvent_t start;
2 error = cudaEventCreate(&start);
3 cudaEvent_t stop;
4 error = cudaEventCreate(&stop);
6 float msecTotal = 0.0f;
7 int nIter = 300;
8 for (int j = 0; j < nIter; j++)
9 {            
10      cudaMemcpy(...);
        // Record the start event
        // (recorded after the cudaMemcpy call above, so the copy sits outside the start/stop pair)
11      error = cudaEventRecord(start, NULL);
12      matrixMulCUDA1<<< grid, threads >>>(...);
       // Record the stop event
13      error = cudaEventRecord(stop, NULL);
        // Wait on the host for the stop event before reading the elapsed time;
        // this happens once per iteration.
14      error = cudaEventSynchronize(stop);
15      float msec = 0.0f;
        // msec receives the time between the start and stop events for this iteration
16      error = cudaEventElapsedTime(&msec, start, stop);
17      msecTotal+=msec;
18 }
// NOTE(review): `error` is overwritten by every call and never inspected in this excerpt.
19 cout<<"Total time = "<<msecTotal<<endl;

为了公平起见，用于对比的算法应当如下：

// Comparison sketch from the question: adds up the per-launch time of matrixMulCUDA2 over nIter launches.
// Unlike the matrixMulCUDA1 listing, this loop contains no cudaMemcpy call.
// NOTE(review): the leading numbers on each line appear to be line labels pasted along with the code.
1 cudaEvent_t start;
2 error = cudaEventCreate(&start);
3 cudaEvent_t stop;
4 error = cudaEventCreate(&stop);
6 float msecTotal = 0.0f;
7 int nIter = 300;
8 for (int j = 0; j < nIter; j++)
9 {
        // Record the start event    
11      error = cudaEventRecord(start, NULL);
12      matrixMulCUDA2<<< grid, threads >>>(...);
       // Record the stop event
13      error = cudaEventRecord(stop, NULL);
        // Wait on the host for the stop event before reading the elapsed time;
        // this happens once per iteration.
14      error = cudaEventSynchronize(stop);
15      float msec = 0.0f;
        // msec receives the time between the start and stop events for this iteration
16      error = cudaEventElapsedTime(&msec, start, stop);
17      msecTotal+=msec;
18 }
// NOTE(review): `error` is overwritten by every call and never inspected in this excerpt.
19 cout<<"Total time = "<<msecTotal<<endl;

我的问题是：这种计时方法对吗？我不太确定。显然，这样测得的时间应该比正常情况更长。

Answer 1

两种方式应该会得到相近的结果。通过在内核启动前后记录事件，你测量到的肯定只是内核执行所花费的时间，而不包含任何 memcpy 的时间。

我唯一想挑剔的一点是：在循环的每次迭代中都调用 cudaEventSynchronize() 会破坏 CPU/GPU 并发，而这种并发对于获得良好性能实际上非常重要。如果必须对每次内核调用单独计时（而不是只用一对事件把包含 n 次迭代的整个 for 循环括起来），你可能需要分配更多的 CUDA 事件。如果采用这种做法，并不需要在每次循环迭代中使用 2 个事件——只需用两个事件把整个操作首尾括起来，并在每次循环迭代中只记录一个 CUDA 事件。这样，任意一次内核调用的时间都可以通过对相邻记录的两个事件调用 cudaEventElapsedTime() 来计算。

记录N次事件之间的GPU时间：

// Time N back-to-back operations with N + 1 events: events[i] marks the start
// of operation i and events[i + 1] marks its end, so the host never has to
// synchronize inside the loop. N is a compile-time constant holding the number
// of timed operations (nIter in the question). Everything enqueued between two
// adjacent records is included in that interval.
cudaEvent_t events[N + 1];
for (int k = 0; k <= N; k++) {
    cudaEventCreate( &events[k] ); // an event must be created before it can be recorded
}

cudaEventRecord( events[0], NULL ); // record first event
for (int j = 0; j < N; j++) {
    // invoke kernel, or do something else you want to time
    cudaEventRecord( events[j + 1], NULL ); // end of operation j, start of operation j + 1
}
// Wait once, after all work has been enqueued, for the last event to complete;
// cudaEventElapsedTime reports cudaErrorNotReady for events that have not completed.
cudaEventSynchronize( events[N] );
// to compute the time taken for operation i (0 <= i < N), call:
float ms;
cudaEventElapsedTime( &ms, events[i], events[i + 1] ); // argument order is (start, end)

// release the events once all timings have been read
for (int k = 0; k <= N; k++) {
    cudaEventDestroy( events[k] );
}

定时CUDA内核应该执行超过1次

1 个答案: