我想测量下面这段CUDA代码的运行时间。我对CUDA还很陌生,所以尝试了下面这几个与计时相关的API:
// Fragment: `start` and `stop` are not declared here; presumably they are
// cudaEvent_t handles created earlier with cudaEventCreate, and `start` has
// already been recorded before the work being timed -- confirm at the call site.
// Enqueue the stop event on stream 0, after the work to be timed.
cudaEventRecord(stop, 0);
// Block the host until the stop event has completed on the device.
cudaEventSynchronize(stop);
// Receives the time elapsed between the two events.
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
但我不知道应该把这些语句放在下面代码中的什么位置,也就是说,我不知道该如何安排它们的顺序。有谁能指导我应该在代码的哪些位置插入这些语句吗?
#include<stdio.h>
#include<stdlib.h>
#define N 512
// Element-wise vector addition on the device: c[i] = a[i] + b[i].
//
// Each launched thread handles exactly one element, selected by its flat
// global index (blockIdx.x * blockDim.x + threadIdx.x). With the launch
// configuration <<< N, 1 >>> this index is simply blockIdx.x.
//
// Parameters:
//   a, b - device pointers to the input vectors
//   c    - device pointer to the output vector
//
// Precondition: the kernel takes no length argument and therefore performs
// no bounds check; the total number of launched threads
// (gridDim.x * blockDim.x) must not exceed the number of elements in a, b
// and c. Expects a 1D grid of 1D blocks.
__global__ void add( int *a, int *b, int *c)
{
    // The original dereferenced only element 0 (*c = *a + *b), so every
    // block redundantly computed the same value and the rest of c was
    // never written.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    c[i] = a[i] + b[i];
}
// Aborts the program with a diagnostic if a CUDA runtime call did not
// return cudaSuccess. `what` names the failing step for the error message.
static void checkCuda( cudaError_t status, const char *what )
{
    if ( status != cudaSuccess ) {
        fprintf( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString( status ) );
        exit( EXIT_FAILURE );
    }
}

// Host driver: allocates N-element integer vectors on host and device,
// uploads the two inputs, launches add() with N blocks of one thread each,
// downloads the result and releases all memory.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main(void)
{
    int *a, *b, *c; // host copies of a, b, c
    int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
    int size = N * sizeof(int); // bytes needed for N integers

    // allocate device copies of a, b, c
    checkCuda( cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)" );
    checkCuda( cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)" );
    checkCuda( cudaMalloc( (void**)&dev_c, size ), "cudaMalloc(dev_c)" );

    // allocate host copies of a, b, c
    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( size );
    if ( a == NULL || b == NULL || c == NULL ) {
        fprintf( stderr, "host allocation of %d bytes failed\n", size );
        return EXIT_FAILURE;
    }

    // Fill the inputs with known values. The original random_ints() calls
    // were commented out, which left a and b uninitialized when they were
    // copied to the device.
    for ( int i = 0; i < N; ++i ) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // copy inputs to device
    checkCuda( cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)" );
    checkCuda( cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_b)" );

    // launch add() kernel with N parallel blocks
    add <<< N, 1 >>>( dev_a, dev_b, dev_c);
    // A kernel launch returns nothing; launch errors must be queried explicitly.
    checkCuda( cudaGetLastError(), "add kernel launch" );

    // copy device result back to host copy of c (blocking copy, so it also
    // waits for the kernel to finish and surfaces any execution error)
    checkCuda( cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );

    free( a ); free( b ); free( c );
    checkCuda( cudaFree( dev_a ), "cudaFree(dev_a)" );
    checkCuda( cudaFree( dev_b ), "cudaFree(dev_b)" );
    checkCuda( cudaFree( dev_c ), "cudaFree(dev_c)" );
    return 0;
}
答案 0 :(得分:2)
CUDA事件的时间测量如下:
假设您要测量内核 `add` 的执行时间。
// Times the add kernel with CUDA events. This fragment replaces the plain
// kernel launch inside main(); dev_a, dev_b, dev_c and N come from there.
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Record the start event on stream 0 immediately before the launch.
cudaEventRecord(start, 0);
add <<< N, 1 >>>( dev_a, dev_b, dev_c);
// Record the stop event on the same stream, right after the launch.
cudaEventRecord(stop, 0);
// The launch is asynchronous: wait until the stop event has actually been
// reached on the device before reading the elapsed time.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
// printf instead of cout: the program only includes stdio.h. %g matches the
// default float formatting that the stream output would have produced.
printf("\n\nElapsed Time = %g milliseconds", elapsedTime);
// Release the event handles once the measurement has been read.
cudaEventDestroy(start);
cudaEventDestroy(stop);