GNU gprof 有 bug 吗?

时间:2014-06-22 14:50:54

标签: c profiling profiler gprof

我有一个C程序,它通过函数pi_calcPiBlock调用函数pi_calcPiItem() 600000000次。因此,我使用GNU gprof来分析各个函数所花费的时间。结果似乎是错误的,因为所有调用都被归到了main()上。此外,调用图也没有任何意义:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls  Ts/call  Ts/call  name
 61.29      9.28     9.28                             pi_calcPiItem
 15.85     11.68     2.40                             pi_calcPiBlock
 11.96     13.49     1.81                             _mcount_private
  9.45     14.92     1.43                             __fentry__
  1.45     15.14     0.22                             pow
  0.00     15.14     0.00 600000000     0.00     0.00  main

                        Call graph


granularity: each sample hit covers 4 byte(s) for 0.07% of 15.14 seconds

index % time    self  children    called     name
                                                 <spontaneous>
[1]     61.3    9.28    0.00                 pi_calcPiItem [1]
-----------------------------------------------
                                                 <spontaneous>
[2]     15.9    2.40    0.00                 pi_calcPiBlock [2]
                0.00    0.00 600000000/600000000     main [6]
-----------------------------------------------
                                                 <spontaneous>
[3]     12.0    1.81    0.00                 _mcount_private [3]
-----------------------------------------------
                                                 <spontaneous>
[4]      9.4    1.43    0.00                 __fentry__ [4]
-----------------------------------------------
                                                 <spontaneous>
[5]      1.5    0.22    0.00                 pow [5]
-----------------------------------------------
                                   6             main [6]
                0.00    0.00 600000000/600000000     pi_calcPiBlock [2]
[6]      0.0    0.00    0.00 600000000+6       main [6]
                                   6             main [6]
-----------------------------------------------

这是一个错误还是我必须以某种方式配置程序?

<spontaneous>是什么意思?

编辑(为您提供更多细节)

代码全是关于pi的计算:

#define PI_BLOCKSIZE (100000000)
#define PI_BLOCKCOUNT (6)
#define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)

/*
 * Program entry point: walks the index range [0, PI_THRESHOLD) in steps of
 * PI_BLOCKSIZE, hands each sub-range to pi_calcPiBlock(), then prints the
 * accumulated value.
 */
int32_t main(int32_t argc, char* argv[]) {
  /* NOTE(review): result is declared without an initializer, yet
   * pi_calcPiBlock() accumulates into it with "+=". */
  double result;

  /* One call per block: i is the block's first index, i + PI_BLOCKSIZE its
   * exclusive end. */
  for ( int32_t i = 0; i < PI_THRESHOLD; i += PI_BLOCKSIZE ) {
    pi_calcPiBlock(&result, i, i + PI_BLOCKSIZE);
  }

  printf("pi = %f\n",result);
  return 0;
}

/*
 * Adds the series terms for the indices start (inclusive) up to end
 * (exclusive) onto *result, using one pi_calcPiItem() call per index.
 */
static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
  int32_t index = start;

  while ( index < end ) {
    double term;

    pi_calcPiItem(&term, index);
    *result += term;
    ++index;
  }
}

static void pi_calcPiItem(double* piItem, int32_t index) {
  *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
}

这就是我得到结果的方式(在Cygwin的帮助下在Windows上执行):

> gcc -std=c99 -o pi *.c -pg -fno-inline-small-functions
> ./pi.exe
> gprof.exe pi.exe

2 个答案:

答案 0 :(得分:2)

尝试:

  1. 使用noinline和noclone函数属性代替-fno-inline-small-functions
    • 通过反汇编main我可以看到-fno-inline-small-functions并未停止内联
  2. 静态链接您的程序(-static)
  3. 您还应该在main中将result初始化为0.0

    这适用于Linux,x86-64:

    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>
    
    #define PI_BLOCKSIZE (100000000)
    #define PI_BLOCKCOUNT (6)
    #define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)
    
    static void pi_calcPiItem(double* piItem, int32_t index);
    static void pi_calcPiBlock(double* result, int32_t start, int32_t end);
    
    /*
     * Program entry point: feeds consecutive PI_BLOCKSIZE-wide index ranges
     * below PI_THRESHOLD to pi_calcPiBlock() and prints the accumulated sum.
     */
    int32_t main(int32_t argc, char* argv[]) {
      double result = 0.0;
      int32_t blockStart = 0;
    
      while ( blockStart < PI_THRESHOLD ) {
        const int32_t blockEnd = blockStart + PI_BLOCKSIZE;
    
        pi_calcPiBlock(&result, blockStart, blockEnd);
        blockStart = blockEnd;
      }
    
      printf("pi = %f\n",result);
      return 0;
    }
    
    /*
     * Accumulates the series terms for indices in [start, end) into *result.
     * The noinline/noclone attributes keep this a real, separately callable
     * function in the compiled program.
     */
    __attribute__((noinline, noclone))
    static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
      for ( int32_t k = start; k < end; k++ ) {
        double term;
    
        pi_calcPiItem(&term, k);
        *result = *result + term;
      }
    }
    
    __attribute__((noinline, noclone))
    static void pi_calcPiItem(double* piItem, int32_t index) {
      *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
    }
    

    构建代码

    $ cc pi.c -o pi -Os -Wall -g3 -I. -std=c99 -pg -static -lm
    

    输出

    $ ./pi && gprof ./pi
    pi = 3.141593
    Flat profile:
    
    Each sample counts as 0.01 seconds.
      %   cumulative   self              self     total           
     time   seconds   seconds    calls  ns/call  ns/call  name    
     85.61     22.55    22.55                             __ieee754_pow_sse2
      4.75     23.80     1.25                             pow
      4.14     24.89     1.09 600000000     1.82     1.82  pi_calcPiItem
      2.54     25.56     0.67                             __exp1
      0.91     25.80     0.24                             pi_calcPiBlock
      0.53     25.94     0.14                             matherr
      0.47     26.07     0.13                             __lseek_nocancel
      0.38     26.17     0.10                             frame_dummy
      0.34     26.26     0.09                             __ieee754_exp_sse2
      0.32     26.34     0.09                             __profile_frequency
      0.00     26.34     0.00        1     0.00     0.00  main
    
    
                 Call graph (explanation follows)
    
    
    granularity: each sample hit covers 2 byte(s) for 0.04% of 26.34 seconds
    
    index % time    self  children    called     name
                                                     <spontaneous>
    [1]     85.6   22.55    0.00                 __ieee754_pow_sse2 [1]
    -----------------------------------------------
                                                     <spontaneous>
    [2]      5.0    0.24    1.09                 pi_calcPiBlock [2]
                    1.09    0.00 600000000/600000000     pi_calcPiItem [4]
    -----------------------------------------------
                                                     <spontaneous>
    [3]      4.7    1.25    0.00                 pow [3]
    -----------------------------------------------
                    1.09    0.00 600000000/600000000     pi_calcPiBlock [2]
    [4]      4.1    1.09    0.00 600000000         pi_calcPiItem [4]
    -----------------------------------------------
                                                     <spontaneous>
    [5]      2.5    0.67    0.00                 __exp1 [5]
    -----------------------------------------------
                                                     <spontaneous>
    [6]      0.5    0.14    0.00                 matherr [6]
    -----------------------------------------------
                                                     <spontaneous>
    [7]      0.5    0.13    0.00                 __lseek_nocancel [7]
    -----------------------------------------------
                                                     <spontaneous>
    [8]      0.4    0.10    0.00                 frame_dummy [8]
    -----------------------------------------------
                                                     <spontaneous>
    [9]      0.3    0.09    0.00                 __ieee754_exp_sse2 [9]
    -----------------------------------------------
                                                     <spontaneous>
    [10]     0.3    0.09    0.00                 __profile_frequency [10]
    -----------------------------------------------
                    0.00    0.00       1/1           __libc_start_main [827]
    [11]     0.0    0.00    0.00       1         main [11]
    -----------------------------------------------
    

    评论

    正如预期的那样,pow()是瓶颈。在pi运行期间,perf top(基于采样的系统级性能分析器)也显示__ieee754_pow_sse2占用了60%以上的CPU。按照@Mike Dunlavey的建议,将pow(-1.0,index)更改为((i & 1) ? -1.0 : 1.0),可使代码大约快4倍。

答案 1 :(得分:0)

在“man gprof”手册页中,有对“spontaneous”的如下解释:
  

本身未被分析(profile)的父函数,会接收其被分析的子函数传播上来的时间,但它们在调用图列表中会显示为自发(spontaneous)调用,并且它们的时间不会再继续向上传播。同样,信号处理函数即使被分析,也会显示为自发调用(尽管原因更为隐晦)。信号处理函数的所有被分析的子函数,其时间都应当能够正常传播,除非该信号处理函数是在分析例程执行期间被调用的,在这种情况下所有信息都会丢失。