Question

这是在某些代码上运行CUDA探查器（nvprof）的日志文件，它们混合了Thrust，cublas和curand。第一个是我写的内核，所以没问题。但是我不确定如何解释第2到第5行，这占用了大量的运行时间。

>  Time(%)   Time    Calls       Avg     Min     Max    Name
>  %         s                   ms      ms      ms
>  28.12     6.82    24,543.00   0.28    0.01    0.64   dev_update_dW1(doub....)
>  23.78     5.77    12,272.00   0.47    0.46    0.49   void thrust::system::cud....
>  14.32     3.47    12,272.00   0.28    0.28    0.29   void thrust::system::cud....
>  10.82     2.62    12,272.00   0.21    0.21    0.22   void thrust::system::cud....
>  4.93      1.20    24,544.00   0.05    0.05    0.05   void thrust::system::cud....
>  3.98      0.96    12,272.00   0.08    0.08    0.09   Act_dAct(double*, long, double*, double*)

第2到第5行完整打印在下面：

第2行： void thrust::system::cuda::detail::detail::launch_closure_by_value>, thrust::counting_iterator<__int64, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, __int64, thrust::tuple, thrust::detail::normal_iterator, thrust::system::cuda::detail::tag, thrust::use_default, thrust::use_default>>, thrust::system::detail::generic::detail::max_element_reduction>, thrust::system::cuda::detail::detail::blocked_thread_array>>(double)

第3行： void thrust::system::cuda::detail::detail::launch_closure_by_value>, thrust::detail::normal_iterator>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, unsigned int, thrust::detail::device_unary_transform_functor, thrust::system::cuda::detail::detail::blocked_thread_array>>(double)

第4行： void thrust::system::cuda::detail::detail::launch_closure_by_value>, double, thrust::use_default>, __int64, double, thrust::detail::normal_iterator>, thrust::plus, thrust::system::cuda::detail::detail::blocked_thread_array>>(exp_functor)

第5行： void thrust::system::cuda::detail::detail::launch_closure_by_value, unsigned int, thrust::detail::device_generate_functor>, thrust::system::cuda::detail::detail::blocked_thread_array>>(double)

编辑：

我有这个函数（softmax），它使用max_element和transform_reduce

/**
 * Processes the Nrow x Ncol matrix `mat` one row at a time and writes the
 * per-row result into `Outmat`.
 *
 * For each row i the function:
 *   1. launches GetRow to stage row i into a temporary device vector,
 *   2. finds the row maximum with thrust::max_element,
 *   3. reduces the row with exp_functor(max) and thrust::plus,
 *   4. computes logsm = max + log(sum),
 *   5. transforms the row with exp_functor(logsm),
 *   6. launches SetRow to store the transformed row into Outmat.
 *
 * NOTE(review): GetRow, SetRow and exp_functor are defined elsewhere;
 * presumably exp_functor(c) maps x to exp(x - c), which makes this a
 * numerically stable row-wise softmax -- confirm against its definition.
 *
 * @param mat    Input matrix stored in a device vector.
 * @param Nrow   Number of rows to process.
 * @param Ncol   Number of elements per row.
 * @param Outmat Output device vector, written through SetRow.
 */
void Softmax_ThrustMatrix(thrust::device_vector<double>& mat, int Nrow, int Ncol, thrust::device_vector<double>& Outmat) {
    // With no columns there is nothing to compute. Returning here also avoids
    // taking &x[0] of an empty vector and dereferencing the end iterator that
    // thrust::max_element returns for an empty range.
    if (Ncol == 0) {
        return;
    }

    // Per-row scratch buffers, allocated once and reused for every row.
    thrust::device_vector<double> x(Ncol, 0.0);
    thrust::device_vector<double> v(Ncol, 0.0);

    // One thread per column; ceiling division so that no fully idle block is
    // launched when Ncol is an exact multiple of the block size.
    dim3 block(16, 1);
    dim3 grid((Ncol + block.x - 1) / block.x, 1);

    double* const matPtr = thrust::raw_pointer_cast(&mat[0]);
    double* const outPtr = thrust::raw_pointer_cast(&Outmat[0]);
    double* const xPtr = thrust::raw_pointer_cast(&x[0]);
    double* const vPtr = thrust::raw_pointer_cast(&v[0]);

    for (int i = 0; i < Nrow; ++i) {
        GetRow<<<grid, block>>>(matPtr, i, Nrow, Ncol, xPtr);

        // Dereferencing a device iterator copies the element to the host, so
        // read the maximum exactly once and reuse the host-side value.
        const double rowMax = *thrust::max_element(x.begin(), x.end());

        const double sum = thrust::transform_reduce(x.begin(), x.end(), exp_functor(rowMax), 0.0, thrust::plus<double>());
        const double logsm = rowMax + log(sum);

        thrust::transform(x.begin(), x.end(), v.begin(), exp_functor(logsm));

        SetRow<<<grid, block>>>(vPtr, i, Nrow, Ncol, outPtr);
    }
}

Answer 1

在底层，Thrust 代码与 CUDA 代码没有任何不同（至少对于面向 GPU 的 Thrust 代码而言）。作为模板库，Thrust 在源代码层面抽象了 CUDA 的许多方面，但探查器并不区分 Thrust 代码和普通的 CUDA 代码。

第2-5行是4个内核启动的探查器数据。从它们的名称语法可以明显看出，它们很可能不是您自己编写的内核——它们是从 Thrust 模板函数的内部深处发起的。

“launch closure”是 Thrust 的术语，指 Thrust 为执行某项功能而启动的内核。您展示的代码中有 3 个 Thrust 调用，另外还有您自己编写的 GetRow 和 SetRow 内核，但这两个内核并没有出现在探查器输出的任何位置，因此我看不出您展示的探查器输出与您展示的代码有什么关系。您也没有展示调用那些确实出现在输出中的内核（dev_update_dW1 和 Act_dAct）的代码，所以我认为，您展示的代码显然无助于进一步解释这份探查器输出。

无论如何，第2-5行表示由 Thrust 启动的 CUDA 内核，它们来自您代码中（某处）的 Thrust 调用。

请注意，Thrust 也可能出于一些不那么显而易见的目的启动内核，例如在实例化设备向量（device_vector）时。

解释CUDA探查器日志文件

1 个答案: