Question

我有一个C程序，它通过函数pi_calcPiBlock调用函数pi_calcPiItem() 600000000次。为了分析这些函数所花费的时间，我使用了GNU gprof。结果似乎是错误的，因为所有调用都被归到了main()名下。此外，调用图也没有任何意义：

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls  Ts/call  Ts/call  name
 61.29      9.28     9.28                             pi_calcPiItem
 15.85     11.68     2.40                             pi_calcPiBlock
 11.96     13.49     1.81                             _mcount_private
  9.45     14.92     1.43                             __fentry__
  1.45     15.14     0.22                             pow
  0.00     15.14     0.00 600000000     0.00     0.00  main

                        Call graph


granularity: each sample hit covers 4 byte(s) for 0.07% of 15.14 seconds

index % time    self  children    called     name
                                                 <spontaneous>
[1]     61.3    9.28    0.00                 pi_calcPiItem [1]
-----------------------------------------------
                                                 <spontaneous>
[2]     15.9    2.40    0.00                 pi_calcPiBlock [2]
                0.00    0.00 600000000/600000000     main [6]
-----------------------------------------------
                                                 <spontaneous>
[3]     12.0    1.81    0.00                 _mcount_private [3]
-----------------------------------------------
                                                 <spontaneous>
[4]      9.4    1.43    0.00                 __fentry__ [4]
-----------------------------------------------
                                                 <spontaneous>
[5]      1.5    0.22    0.00                 pow [5]
-----------------------------------------------
                                   6             main [6]
                0.00    0.00 600000000/600000000     pi_calcPiBlock [2]
[6]      0.0    0.00    0.00 600000000+6       main [6]
                                   6             main [6]
-----------------------------------------------

这是一个错误还是我必须以某种方式配置程序？

<spontaneous>是什么意思？

编辑（为您提供更多细节）

代码全是关于pi的计算：

/* Number of series terms handled by one pi_calcPiBlock() call. */
#define PI_BLOCKSIZE (100000000)
/* Number of blocks main() iterates over. */
#define PI_BLOCKCOUNT (6)
/* Total number of terms; exclusive upper bound of the loop index in main(). */
#define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)

/*
 * Entry point: walks the index range [0, PI_THRESHOLD) in steps of
 * PI_BLOCKSIZE, hands each sub-range to pi_calcPiBlock(), and prints the
 * accumulated value.
 *
 * NOTE(review): pi_calcPiBlock() is defined as `static` below this function;
 * no prototype is visible before the call in this excerpt -- confirm one
 * exists in the full file.
 */
int32_t main(int32_t argc, char* argv[]) {
  /* NOTE(review): `result` is never assigned before pi_calcPiBlock() performs
     `*result += ...` on it, so the sum starts from an indeterminate value. */
  double result;

  /* Each iteration covers the half-open range [i, i + PI_BLOCKSIZE). */
  for ( int32_t i = 0; i < PI_THRESHOLD; i += PI_BLOCKSIZE ) {
    pi_calcPiBlock(&result, i, i + PI_BLOCKSIZE);
  }

  printf("pi = %f\n",result);
  return 0;
}

/*
 * Adds the series terms for every index in [start, end) to *result.
 * The caller's value in *result is kept and accumulated onto; each term is
 * obtained from pi_calcPiItem().
 */
static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
  double term;
  int32_t k = start;

  while ( k < end ) {
    pi_calcPiItem(&term, k);
    *result += term;
    ++k;
  }
}

static void pi_calcPiItem(double* piItem, int32_t index) {
  *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
}

这就是我得到结果的方式（在Cygwin的帮助下在Windows上执行）：

> gcc -std=c99 -o pi *.c -pg -fno-inline-small-functions
> ./pi.exe
> gprof.exe pi.exe

Answer 1

尝试：

使用noinline，noclone函数属性代替-fno-inline-small-functions
- 通过反汇编main我可以看到-fno-inline-small-functions并未停止内联
静态链接您的程序（-static）
您还应该在main中将result初始化为0.0

这适用于Linux，x86-64：

#include <stdio.h>
#include <stdint.h>
#include <math.h>

/* Number of series terms handled by one pi_calcPiBlock() call. */
#define PI_BLOCKSIZE (100000000)
/* Number of blocks main() iterates over. */
#define PI_BLOCKCOUNT (6)
/* Total number of terms; exclusive upper bound of the loop index in main(). */
#define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)

/* Forward declarations: both helpers are defined after their first use. */
static void pi_calcPiItem(double* piItem, int32_t index);
static void pi_calcPiBlock(double* result, int32_t start, int32_t end);

/*
 * Entry point: starts the sum at zero, feeds the index range
 * [0, PI_THRESHOLD) to pi_calcPiBlock() in PI_BLOCKSIZE-wide pieces, and
 * prints the accumulated value.
 */
int32_t main(int32_t argc, char* argv[]) {
  double result = 0.0;
  int32_t blockStart = 0;

  while ( blockStart < PI_THRESHOLD ) {
    /* Half-open range [blockStart, blockStart + PI_BLOCKSIZE). */
    pi_calcPiBlock(&result, blockStart, blockStart + PI_BLOCKSIZE);
    blockStart += PI_BLOCKSIZE;
  }

  printf("pi = %f\n",result);
  return 0;
}

/*
 * Adds the series terms for every index in [start, end) to *result.
 * The caller's value in *result is kept and accumulated onto.
 * The attributes keep this function and its call to pi_calcPiItem() as
 * separate, out-of-line entities.
 */
__attribute__((noinline, noclone))
static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
  double term;
  int32_t k = start;

  while ( k < end ) {
    pi_calcPiItem(&term, k);
    *result += term;
    ++k;
  }
}

__attribute__((noinline, noclone))
static void pi_calcPiItem(double* piItem, int32_t index) {
  *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
}

构建代码

$ cc pi.c -o pi -Os -Wall -g3 -I. -std=c99 -pg -static -lm

输出

$ ./pi && gprof ./pi
pi = 3.141593
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ns/call  ns/call  name    
 85.61     22.55    22.55                             __ieee754_pow_sse2
  4.75     23.80     1.25                             pow
  4.14     24.89     1.09 600000000     1.82     1.82  pi_calcPiItem
  2.54     25.56     0.67                             __exp1
  0.91     25.80     0.24                             pi_calcPiBlock
  0.53     25.94     0.14                             matherr
  0.47     26.07     0.13                             __lseek_nocancel
  0.38     26.17     0.10                             frame_dummy
  0.34     26.26     0.09                             __ieee754_exp_sse2
  0.32     26.34     0.09                             __profile_frequency
  0.00     26.34     0.00        1     0.00     0.00  main


             Call graph (explanation follows)


granularity: each sample hit covers 2 byte(s) for 0.04% of 26.34 seconds

index % time    self  children    called     name
                                                 <spontaneous>
[1]     85.6   22.55    0.00                 __ieee754_pow_sse2 [1]
-----------------------------------------------
                                                 <spontaneous>
[2]      5.0    0.24    1.09                 pi_calcPiBlock [2]
                1.09    0.00 600000000/600000000     pi_calcPiItem [4]
-----------------------------------------------
                                                 <spontaneous>
[3]      4.7    1.25    0.00                 pow [3]
-----------------------------------------------
                1.09    0.00 600000000/600000000     pi_calcPiBlock [2]
[4]      4.1    1.09    0.00 600000000         pi_calcPiItem [4]
-----------------------------------------------
                                                 <spontaneous>
[5]      2.5    0.67    0.00                 __exp1 [5]
-----------------------------------------------
                                                 <spontaneous>
[6]      0.5    0.14    0.00                 matherr [6]
-----------------------------------------------
                                                 <spontaneous>
[7]      0.5    0.13    0.00                 __lseek_nocancel [7]
-----------------------------------------------
                                                 <spontaneous>
[8]      0.4    0.10    0.00                 frame_dummy [8]
-----------------------------------------------
                                                 <spontaneous>
[9]      0.3    0.09    0.00                 __ieee754_exp_sse2 [9]
-----------------------------------------------
                                                 <spontaneous>
[10]     0.3    0.09    0.00                 __profile_frequency [10]
-----------------------------------------------
                0.00    0.00       1/1           __libc_start_main [827]
[11]     0.0    0.00    0.00       1         main [11]
-----------------------------------------------

评论

正如预期的那样，pow()是瓶颈。当pi正在运行时，perf top（基于采样的系统探查器）也会显示__ieee754_pow_sse2占用60％以上的CPU。按照@Mike Dunlavey的建议，将pow(-1.0,index)更改为((i & 1) ? -1.0 : 1.0)可使代码快大约4倍。

Answer 2

在'man gprof'手册页中，对"spontaneous"的解释如下：

自身未被分析（profile）的父函数，会得到其被分析的子函数传播上来的时间，但它们在调用图列表中会显示为被自发（spontaneous）调用，并且它们的时间不会再继续向上传播。同样，信号处理函数即使被分析，也会显示为自发调用（尽管原因更为隐晦）。信号处理函数的任何被分析的子函数，其时间都应当被正确传播，除非该信号处理函数是在分析例程执行期间被调用的，在这种情况下一切信息都会丢失。

GNU gprof是否有bug？

2 个答案: