下面是我的 nvprof 分析结果,我想了解其中 API calls 部分的含义。
API calls 中的第一项(cudaMalloc)耗时 4.67456s,比 GPU activities 中的第一项(nms_forward_kernel,97.765ms)长得多,为什么?
==25972== Profiling application: python view.py
==25972== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.62% 97.765ms 16999 5.7510us 2.6560us 11.744us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::nms_forward_kernel(float*, float const *, float, int, int)
1.09% 1.0835ms 90 12.039us 992ns 48.799us [CUDA memcpy HtoD]
0.06% 58.240us 5 11.648us 11.392us 12.256us void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int)
0.06% 56.352us 2 28.176us 26.720us 29.632us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>*, bool=0 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**>)
0.05% 52.672us 3 17.557us 16.576us 19.136us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>*, bool=1 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**>)
0.03% 27.136us 1 27.136us 27.136us 27.136us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_preprocess_kernel(float const *, float*, int, int*)
0.03% 26.527us 2 13.263us 13.216us 13.311us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=0*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 19.744us 3 6.5810us 5.4720us 8.5120us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 18.528us 2 9.2640us 9.0880us 9.4400us [CUDA memcpy DtoH]
0.01% 8.2240us 1 8.2240us 8.2240us 8.2240us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_postprocess_kernel(float const *, float*, int, int*)
0.00% 3.7120us 1 3.7120us 3.7120us 3.7120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
0.00% 3.3600us 1 3.3600us 3.3600us 3.3600us void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
0.00% 2.9760us 1 2.9760us 2.9760us 2.9760us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.5600us 1 2.5600us 2.5600us 2.5600us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(float*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.3680us 1 2.3680us 2.3680us 2.3680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>)
API calls: 69.38% 4.67456s 8 584.32ms 21.948us 4.66813s cudaMalloc
19.85% 1.33738s 1 1.33738s 1.33738s 1.33738s cudaDeviceReset
6.85% 461.19ms 16999 27.130us 4.3450us 2.3428ms cudaStreamCreate
2.18% 146.78ms 17019 8.6240us 5.5850us 590.15us cudaLaunchKernel
0.78% 52.472ms 16998 3.0860us 2.3880us 491.82us cudaEventRecord
0.48% 32.347ms 16998 1.9030us 1.6020us 579.51us cudaStreamWaitEvent
0.41% 27.471ms 16998 1.6160us 1.0150us 501.06us cudaEventCreate
0.02% 1.0187ms 47 21.674us 8.9530us 82.099us cudaMemcpyAsync
0.01% 859.57us 45 19.101us 6.6610us 60.919us cudaMemcpy
0.01% 737.22us 47 15.685us 3.5030us 54.214us cudaStreamSynchronize
0.01% 513.43us 278 1.8460us 427ns 69.612us cuDeviceGetAttribute
0.01% 391.43us 430 910ns 571ns 12.840us cudaGetDevice
0.01% 353.59us 3 117.86us 116.03us 120.19us cuDeviceTotalMem
0.00% 258.63us 2 129.32us 128.63us 130.00us cudaFree
0.00% 223.59us 2 111.79us 95.946us 127.64us cudaGetDeviceProperties
0.00% 139.32us 147 947ns 715ns 7.0800us cudaSetDevice
0.00% 130.12us 240 542ns 390ns 2.9830us cudaGetDeviceCount
0.00% 113.01us 3 37.669us 23.669us 49.539us cuDeviceGetName
0.00% 101.80us 1 101.80us 101.80us 101.80us cudaDeviceSynchronize
0.00% 67.069us 2 33.534us 27.864us 39.205us cudaLaunch
0.00% 22.799us 6 3.7990us 2.7200us 6.9700us cudaFuncGetAttributes
0.00% 12.063us 12 1.0050us 822ns 1.9320us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 11.027us 23 479ns 403ns 754ns cudaPeekAtLastError
0.00% 5.5760us 5 1.1150us 493ns 2.9760us cuDeviceGetCount
0.00% 4.6710us 2 2.3350us 1.3820us 3.2890us cuInit
0.00% 4.6090us 6 768ns 683ns 1.0360us cudaDeviceGetAttribute
0.00% 3.9340us 1 3.9340us 3.9340us 3.9340us cuDeviceGetPCIBusId
0.00% 3.5570us 5 711ns 463ns 1.1720us cudaSetupArgument
0.00% 3.0960us 4 774ns 446ns 1.2680us cuDeviceGet
0.00% 3.0570us 2 1.5280us 1.2220us 1.8350us cudaConfigureCall
0.00% 2.2150us 2 1.1070us 975ns 1.2400us cuDriverGetVersion
0.00% 624ns 1 624ns 624ns 624ns cudaGetLastError
0.00% 526ns 1 526ns 526ns 526ns cuDeviceGetUuid
答案 0 :(得分:2)
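程序中调用的第一个 CUDA API 函数会包含 CUDA 上下文惰性初始化(lazy context establishment)的开销。在本例中,这些 cudaMalloc 调用很可能来自 thrust::device_vector 的构造。
从 API calls 的 Max 列可以看出,8 次 cudaMalloc 调用中有一次耗时 4.66813s,几乎占了 4.67456s 总耗时的全部,而最短的一次只有 21.948us;也就是说,这段时间并不是花在内存分配本身上。
因此在您的环境中,初始化 CUDA 上下文大约需要 4.6 秒。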