CUDA性能分析:nvprof结果中的"API calls"部分是什么意思?

时间:2019-01-27 19:30:04

标签: cuda

下面是我的nvprof输出,我想弄清楚其中"API calls"部分的含义。API calls中的第一项(cudaMalloc)耗时4.67456s,比GPU activities中的第一项(nms_forward_kernel,97.765ms)长得多,这是为什么?

==25972== Profiling application: python view.py
==25972== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.62%  97.765ms     16999  5.7510us  2.6560us  11.744us  _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::nms_forward_kernel(float*, float const *, float, int, int)
                    1.09%  1.0835ms        90  12.039us     992ns  48.799us  [CUDA memcpy HtoD]
                    0.06%  58.240us         5  11.648us  11.392us  12.256us  void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int)
                    0.06%  56.352us         2  28.176us  26.720us  29.632us  void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>*, bool=0 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**>)
                    0.05%  52.672us         3  17.557us  16.576us  19.136us  void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>*, bool=1 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**>)
                    0.03%  27.136us         1  27.136us  27.136us  27.136us  _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_preprocess_kernel(float const *, float*, int, int*)
                    0.03%  26.527us         2  13.263us  13.216us  13.311us  void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=0*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
                    0.02%  19.744us         3  6.5810us  5.4720us  8.5120us  void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
                    0.02%  18.528us         2  9.2640us  9.0880us  9.4400us  [CUDA memcpy DtoH]
                    0.01%  8.2240us         1  8.2240us  8.2240us  8.2240us  _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_postprocess_kernel(float const *, float*, int, int*)
                    0.00%  3.7120us         1  3.7120us  3.7120us  3.7120us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
                    0.00%  3.3600us         1  3.3600us  3.3600us  3.3600us  void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
                    0.00%  2.9760us         1  2.9760us  2.9760us  2.9760us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
                    0.00%  2.5600us         1  2.5600us  2.5600us  2.5600us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(float*, thrust::cuda_cub::__transform::no_stencil_tag)
                    0.00%  2.3680us         1  2.3680us  2.3680us  2.3680us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>)
      API calls:   69.38%  4.67456s         8  584.32ms  21.948us  4.66813s  cudaMalloc
                   19.85%  1.33738s         1  1.33738s  1.33738s  1.33738s  cudaDeviceReset
                    6.85%  461.19ms     16999  27.130us  4.3450us  2.3428ms  cudaStreamCreate
                    2.18%  146.78ms     17019  8.6240us  5.5850us  590.15us  cudaLaunchKernel
                    0.78%  52.472ms     16998  3.0860us  2.3880us  491.82us  cudaEventRecord
                    0.48%  32.347ms     16998  1.9030us  1.6020us  579.51us  cudaStreamWaitEvent
                    0.41%  27.471ms     16998  1.6160us  1.0150us  501.06us  cudaEventCreate
                    0.02%  1.0187ms        47  21.674us  8.9530us  82.099us  cudaMemcpyAsync
                    0.01%  859.57us        45  19.101us  6.6610us  60.919us  cudaMemcpy
                    0.01%  737.22us        47  15.685us  3.5030us  54.214us  cudaStreamSynchronize
                    0.01%  513.43us       278  1.8460us     427ns  69.612us  cuDeviceGetAttribute
                    0.01%  391.43us       430     910ns     571ns  12.840us  cudaGetDevice
                    0.01%  353.59us         3  117.86us  116.03us  120.19us  cuDeviceTotalMem
                    0.00%  258.63us         2  129.32us  128.63us  130.00us  cudaFree
                    0.00%  223.59us         2  111.79us  95.946us  127.64us  cudaGetDeviceProperties
                    0.00%  139.32us       147     947ns     715ns  7.0800us  cudaSetDevice
                    0.00%  130.12us       240     542ns     390ns  2.9830us  cudaGetDeviceCount
                    0.00%  113.01us         3  37.669us  23.669us  49.539us  cuDeviceGetName
                    0.00%  101.80us         1  101.80us  101.80us  101.80us  cudaDeviceSynchronize
                    0.00%  67.069us         2  33.534us  27.864us  39.205us  cudaLaunch
                    0.00%  22.799us         6  3.7990us  2.7200us  6.9700us  cudaFuncGetAttributes
                    0.00%  12.063us        12  1.0050us     822ns  1.9320us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
                    0.00%  11.027us        23     479ns     403ns     754ns  cudaPeekAtLastError
                    0.00%  5.5760us         5  1.1150us     493ns  2.9760us  cuDeviceGetCount
                    0.00%  4.6710us         2  2.3350us  1.3820us  3.2890us  cuInit
                    0.00%  4.6090us         6     768ns     683ns  1.0360us  cudaDeviceGetAttribute
                    0.00%  3.9340us         1  3.9340us  3.9340us  3.9340us  cuDeviceGetPCIBusId
                    0.00%  3.5570us         5     711ns     463ns  1.1720us  cudaSetupArgument
                    0.00%  3.0960us         4     774ns     446ns  1.2680us  cuDeviceGet
                    0.00%  3.0570us         2  1.5280us  1.2220us  1.8350us  cudaConfigureCall
                    0.00%  2.2150us         2  1.1070us     975ns  1.2400us  cuDriverGetVersion
                    0.00%     624ns         1     624ns     624ns     624ns  cudaGetLastError
                    0.00%     526ns         1     526ns     526ns     526ns  cuDeviceGetUuid

1 个答案:

答案 0 :(得分:2)

程序中第一个被调用的CUDA API函数,其耗时包含了CUDA上下文惰性创建(lazy context establishment)的开销。在本例中,这个cudaMalloc调用很可能来自Thrust的device_vector(thrust::device_vector)的构造。

从cudaMalloc一行可以看出:8次调用总共耗时4.67456s,而其中单次最长的调用(Max列)就占了4.66813s。也就是说,在您的环境中,初始化CUDA上下文似乎需要约4.67秒。