我想知道如何准确定位应用程序耗时最多的位置。这是一段带有 CUDA 调用的 C++ 代码:我在 C++ 代码中编写了包装函数(wrapper)来调用 CUDA 代码。在 C++ 端对代码计时,得到的执行时间约为 5 秒;但如果在 Nsight 中分析代码,内核只需要约 8 ms。这怎么可能呢?
C++ 端的代码:
// Host-side wall-clock measurement around the wrapper call. The interval covers
// everything callDivideKernel does before it returns (event setup, the launch,
// and the waits it performs), not only the kernel's execution on the GPU.
// NOTE(review): the unit returned by get_host_current_time() is not visible
// here (presumably seconds) — confirm before comparing with the event time in ms.
double start_divide = get_host_current_time();
callDivideKernel( keep, d_a, d_A_N );
double end_divide = get_host_current_time();
// NOTE(review): if earlier asynchronous GPU work is still queued on the default
// stream when the wrapper is entered, the wrapper's waits also absorb that work,
// which would make this figure far larger than the kernel time — verify by
// synchronizing the device before taking start_divide.
printf("divideKernel : %g\n", end_divide - start_divide);
.cu 文件:
// Forward declaration: the kernel is defined after this wrapper, so it must be
// declared before the launch below can compile.
__global__ void DivideKernel(int N, float* A, int* A_N);

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool divideCudaOk(cudaError_t status, const char* what){
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "callDivideKernel: %s failed: %s\n",
            what, cudaGetErrorString(status));
    return false;
}

// Launches DivideKernel on the default stream, prints the kernel time measured
// with CUDA events, and returns only after the device has finished.
//
// Parameters:
//   N   - matrix dimension passed through to the kernel; the kernel processes
//         N*(N+1)/2 packed indices.
//   A   - device pointer forwarded to the kernel (values divided in place).
//   A_N - device pointer forwarded to the kernel (integer divisors).
//
// The grid is derived from N so that every packed index gets a thread; a fixed
// grid would silently leave elements untouched once N*(N+1)/2 exceeded it.
// CUDA failures are reported on stderr; a failure of the timing events does not
// prevent the kernel from being launched.
void callDivideKernel(int N, float* A, int* A_N){
    const int threadsPerBlock = 512;
    // 65535 is a valid grid extent in x and y on every compute capability.
    const long long maxGridExtent = 65535;

    // 64-bit arithmetic so the element count cannot overflow on the host.
    const long long total = ((long long)N * ((long long)N + 1)) / 2;
    long long blocksNeeded = (total + threadsPerBlock - 1) / threadsPerBlock;
    if (blocksNeeded < 1) {
        blocksNeeded = 1;  // a zero-sized grid is an invalid launch configuration
    }
    const long long gridX = (blocksNeeded < maxGridExtent) ? blocksNeeded : maxGridExtent;
    const long long gridY = (blocksNeeded + gridX - 1) / gridX;

    dim3 dimGrid((unsigned int)gridX, (unsigned int)gridY);
    dim3 dimBlock(threadsPerBlock);

    // Timing is best effort: `timed` turns false as soon as any event call fails.
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    bool timed = true;
    if (!divideCudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        start = 0;
        timed = false;
    }
    if (timed && !divideCudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        stop = 0;
        timed = false;
    }

    if (timed) {
        timed = divideCudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    }

    DivideKernel<<< dimGrid, dimBlock >>>(N, A, A_N);
    // Launch-configuration errors are only visible through cudaGetLastError().
    const bool launched = divideCudaOk(cudaGetLastError(), "DivideKernel launch");

    if (timed) {
        timed = divideCudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    }
    if (timed) {
        timed = divideCudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    }
    if (timed && launched) {
        float time = 0.0f;
        if (divideCudaOk(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime")) {
            printf("callDividekernel = %f ms\n",time);
        }
    }

    // Replaces the deprecated cudaThreadSynchronize(); also surfaces errors
    // raised asynchronously while the kernel was running.
    divideCudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Release the events so repeated calls do not leak them.
    if (start != 0) {
        cudaEventDestroy(start);
    }
    if (stop != 0) {
        cudaEventDestroy(stop);
    }
}
// Divides elements of A in place by the matching entry of A_N, clamped to >= 1.
//
// Each thread handles one packed index k in [0, N*(N+1)/2). The index is
// unpacked into (row, col) and the element at offset row*N + col is updated.
// Expected launch layout: 1D blocks on a 1D or 2D grid; threads whose index is
// past the last packed element do nothing.
//
// Parameters:
//   N   - row stride and size used to compute the number of packed indices.
//   A   - values to divide, updated in place at offset row*N + col.
//   A_N - integer divisors read at the same offsets; values below 1 count as 1.
__global__ void DivideKernel(int N, float* A, int* A_N){
    // Flatten (blockIdx.y, blockIdx.x, threadIdx.x) into one linear index.
    const unsigned int threadsPerGridRow = blockDim.x * gridDim.x;
    const unsigned int offsetInGridRow = blockIdx.x * blockDim.x + threadIdx.x;
    const int k = offsetInGridRow + threadsPerGridRow * blockIdx.y;

    const int kmax = (N*(N+1))/2;
    if (k >= kmax) {
        return;
    }

    // Invert k = row*(row+1)/2 + col. Double precision is deliberate here:
    // 2*k can exceed what a float mantissa represents exactly.
    const int row = (int)(sqrt(0.25+2.0*k)-0.5);
    const int col = k - (row*(row+1))/2;
    const int offset = row*N + col;

    // Clamp the divisor so zero or negative counts leave the value unchanged.
    const int divisor = max(1, A_N[offset]);
    A[offset] = A[offset] / (float)divisor;
}
结果:
callDividekernel = 7.111040 ms
divideKernel : 5.66533