Question

我想知道如何准确定位应用程序把时间花在了哪里。这是一段调用 CUDA 的 C++ 代码：我在 C++ 端编写了包装函数，由它来调用 CUDA 代码。在 C++ 端对这次调用计时，得到的执行时间约为 5 秒；但如果在 Nsight 中分析同一段代码，内核本身只需要约 8 ms。这怎么可能呢？

C++ 端的代码：

// Host-side timing of the whole callDivideKernel() wrapper call.
// The interval measured here spans everything the wrapper does on the host
// (event creation, the launch, waiting on the stop event, its own printf and
// the final device synchronize), not just the kernel's execution on the GPU.
// NOTE(review): if earlier GPU work was queued asynchronously before this
// point, the wait for it would presumably be counted here as well — confirm by
// synchronizing the device before taking start_divide.
double start_divide = get_host_current_time();
callDivideKernel( keep, d_a, d_A_N );
double end_divide = get_host_current_time();
// Units are whatever get_host_current_time() returns — presumably seconds,
// since the question describes this value as ~5 s; TODO confirm.
printf("divideKernel : %g\n", end_divide - start_divide);

.cu 文件：

// Forward declaration so the launch below compiles even when the kernel is
// defined later in the translation unit.
__global__ void DivideKernel(int N, float* A, int* A_N);

// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool divideCudaCallOk(cudaError_t err, const char* what){
  if(err == cudaSuccess) return true;
  fprintf(stderr, "callDivideKernel: %s failed: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Launches DivideKernel over the N*(N+1)/2 packed triangular indices and prints
// the GPU time elapsed between two events recorded on stream 0 around the launch.
//
//   N    problem size forwarded to the kernel
//   A    device pointer forwarded to the kernel (modified in place by it)
//   A_N  device pointer forwarded to the kernel
//
// The printed time covers only the interval between the two recorded events.
// Host-side work in this function (event creation/destruction, the final
// device-wide synchronize, printf) is outside that interval, so a host timer
// wrapped around this call will report a larger value.
//
// Failures of CUDA runtime calls are reported on stderr; the function returns
// void, so callers cannot observe them through the return value.
void callDivideKernel(int N, float* A, int* A_N){

  const int threadsPerBlock = 512;
  // Grid dimensions are kept <= 65535 per axis so the launch configuration is
  // valid on every compute capability.
  const long long maxGridDim = 65535;

  // Size the grid from N instead of a fixed 618x128 so every packed index is
  // covered. 64-bit arithmetic keeps the host-side size computation from
  // overflowing for large N.
  const long long total = ((long long)N * ((long long)N + 1)) / 2;
  long long blocks = (total + threadsPerBlock - 1) / threadsPerBlock;
  if(blocks < 1) blocks = 1;  // a zero-sized grid is an invalid launch configuration
  const long long gridX = blocks < maxGridDim ? blocks : maxGridDim;
  const long long gridY = (blocks + gridX - 1) / gridX;
  if(gridY > maxGridDim){
    fprintf(stderr, "callDivideKernel: N = %d needs more blocks than a 2D grid can hold\n", N);
    return;
  }

  cudaEvent_t start = 0, stop = 0;
  if(!divideCudaCallOk(cudaEventCreate(&start), "cudaEventCreate(start)")) return;
  if(!divideCudaCallOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")){
    cudaEventDestroy(start);
    return;
  }

  dim3 dimGrid((unsigned int)gridX, (unsigned int)gridY);
  dim3 dimBlock(threadsPerBlock);

  float time = 0.0f;
  bool ok = divideCudaCallOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
  if(ok){
    DivideKernel<<< dimGrid,dimBlock >>>(N, A, A_N);
    // A kernel launch returns nothing; configuration errors surface here.
    ok = divideCudaCallOk(cudaGetLastError(), "DivideKernel launch");
  }
  ok = ok && divideCudaCallOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
  // Block the host until the stop event has completed so the elapsed time is valid.
  ok = ok && divideCudaCallOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
  ok = ok && divideCudaCallOk(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
  if(ok) printf("callDividekernel = %f ms\n",time);

  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize();
  // it also surfaces errors raised while the kernel was executing.
  divideCudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Release the events; the original leaked both on every call.
  cudaEventDestroy(start);
  cudaEventDestroy(stop);

}

// Divides, in place, each element A[row*N + col] with 0 <= col <= row by the
// matching integer in A_N[row*N + col], clamped to a minimum of 1.
//
// Each thread handles one packed triangular index k in [0, N*(N+1)/2) and
// unpacks it into (row, col); threads whose k falls outside that range do
// nothing, so the grid may be larger than the work.
//
// Launch layout: one-dimensional blocks on a one- or two-dimensional grid.
// The flat index is built from blockIdx.x, blockIdx.y, blockDim.x and
// gridDim.x only.
//
//   N    row stride used for indexing and the bound of the triangle
//   A    values divided in place
//   A_N  divisors; read only by this kernel
// NOTE(review): A and A_N are presumably N x N row-major device arrays —
// confirm with the caller.
__global__ void DivideKernel(int N, float* A, int* A_N){

  // All index arithmetic is 64-bit: N*(N+1) and row*N + col exceed the range
  // of a 32-bit int once N is larger than 46340.
  const long long threadsPerGridRow = (long long)blockDim.x * gridDim.x;
  const long long k = (long long)blockIdx.x * blockDim.x + threadIdx.x
                    + threadsPerGridRow * blockIdx.y;

  const long long kmax = ((long long)N * ((long long)N + 1)) / 2;
  if(k >= kmax) return;

  // Invert k = row*(row+1)/2 + col. Double precision is deliberate: a float
  // square root cannot resolve neighbouring k values once k is large.
  long long row = (long long)(sqrt(0.25 + 2.0 * (double)k) - 0.5);
  // Integer correction in case the floating-point estimate is off by one.
  while(row > 0 && (row * (row + 1)) / 2 > k) --row;
  while(((row + 1) * (row + 2)) / 2 <= k) ++row;

  const long long col = k - (row * (row + 1)) / 2;
  const long long idx = row * N + col;

  // Clamp the divisor to at least 1 so A is never divided by zero or by a
  // negative count.
  const int val = max(1, A_N[idx]);
  A[idx] /= (float)val;
}

结果：

callDividekernel = 7.111040 ms
divideKernel : 5.66533

在这次内核调用中，CUDA 的时间究竟花在了哪里？

0 个答案: