Question

我编写了一些 C++ 代码，用来比较 C++ 与内联汇编的执行时间。起初我只是随便试试，但后来注意到，每次运行程序得到的结果都不一样：有时 C++ 更快，有时内联汇编更快，有时两者相同。

这是怎么回事？

以下是代码和程序输出：

#define TRIALS 1000000
#include <chrono>
#include <iostream>
using namespace std;
typedef std::chrono::high_resolution_clock Clock;
// Micro-benchmark that times four ways of multiplying an int by 10:
//   1. C++ compound assignment   (var *= 10)
//   2. inline asm using shifts   (x*2 + x*8)
//   3. C++ plain assignment      (var = var * 10)
//   4. inline asm using mul
// Each variant runs TRIALS times; every iteration is bracketed by two clock
// reads and the average elapsed time per iteration is printed in nanoseconds.
// The figures therefore include the cost of the clock reads themselves.
// The inline asm is x86-specific (GCC/Clang extended-asm syntax).
int main()
{
  // Initialised only so that `auto` deduces the time_point type.
  auto t1 = Clock::now();
  auto t2 = Clock::now();
  int X3=17;
  int X2=17;
  int X4=17;
  int X=17;

  // count() returns a wide signed integer; accumulate in long long so the
  // running total cannot overflow when TRIALS is raised.
  long long sum=0;
  long long avg=0;
  cout << "=================================" << endl;
  cout << "| var*=10;                      |" << endl;
  cout << "=================================" << endl;

  for( int i=0; i<TRIALS; i++ )
    {
      X3=17;
      t1 = Clock::now();
      X3*=10;
      t2 = Clock::now();
      sum+=std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
    }
  avg=sum/TRIALS;
  cout << "| Product:  " << X3<< "  "<< avg << " nanoseconds |" << endl;
  cout << "=================================" << endl;
  cout << endl << endl;

  avg=sum=0;
  cout << "=================================" << endl;
  cout << "| use inline assembler with shl |" << endl;
  cout << "=================================" << endl;

  for( int i=0; i<TRIALS; i++ )
    {
      X=17;
      t1 = Clock::now();
      // volatile: the input is the same on every iteration, so without it an
      // optimizing compiler may discard the statement or hoist it out of the
      // loop.
      asm volatile (
            "movl %0, %%eax;" // %0 is already eax ("=a"), so this is a no-op move
            "shll %%eax;"// ax*=2
            "movl %%eax, %%ebx;" // ax->bx (bx = 2*X)
            "shll %%eax;" // ax*=2
            "shll %%eax;" // ax*=2 (ax = 8*X)
            "add %%ebx, %%eax;" // bx+ax->ax (2*X + 8*X = 10*X)
            : "=a" (X)
            : "a" (X)
            : "%ebx", "cc" // shifts and add modify the flags
            );
      t2 = Clock::now();
      sum+=std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
    }
  avg=sum/TRIALS;
  cout << "| Product:  " << X << "  "<< avg << " nanoseconds |" << endl;
  cout << "=================================" << endl;
  cout << endl << endl;
  avg=sum=0;

  cout << "=================================" << endl;
  cout << "| var=var*10                    |" << endl;
  cout << "=================================" << endl;

  for( int i=0; i<TRIALS; i++ )
    {
      X2=17;
      t1 = Clock::now();
      X2=X2*10;
      t2 = Clock::now();
      sum+=std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
    }
  avg=sum/TRIALS;
  // Report X2, the variable this section actually computed (was X3).
  cout << "| Product:  " << X2<< "  "<< avg << " nanoseconds |" << endl;
  cout << "=================================" << endl;
  cout << endl << endl;

  avg=sum=0;


  cout << "=================================" << endl;
  cout << "| use inline assembler with mul |" << endl;
  cout << "=================================" << endl;
  for( int i=0; i<TRIALS; i++ )
    {
      X4=17;
      t1 = Clock::now();
      // volatile for the same reason as the shl variant above.
      asm volatile (
            "movl %0, %%eax;" // %0 is already eax ("=a"), so this is a no-op move
            "movl $0x0A, %%ebx;" // 10->bx
            "mul %%ebx;" // 10*ax -> edx:eax
            : "=a" (X4)
            : "a" (X4)
            : "%ebx", "%edx", "cc" // mul writes the high half to edx and sets flags
            );
      t2 = Clock::now();
      sum+=std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
    }
  avg=sum/TRIALS;
  cout << "| Product:  " << X4<< "  "<< avg << " nanoseconds |" << endl;
  cout << "=================================" << endl;
  cout << endl;

  return(0);
}

程序输出1：

=================================
| var*=10;                      |
=================================
| Product:  170  50 nanoseconds |
=================================


=================================
| use inline assembler with shl |
=================================
| Product:  170  50 nanoseconds |
=================================


=================================
| var=var*10                    |
=================================
| Product:  170  50 nanoseconds |
=================================


=================================
| use inline assembler with mul |
=================================
| Product:  170  50 nanoseconds |
=================================

输出2：

=================================
| var*=10;                      |
=================================
| Product:  170  62 nanoseconds |
=================================


=================================
| use inline assembler with shl |
=================================
| Product:  170  57 nanoseconds |
=================================


=================================
| var=var*10                    |
=================================
| Product:  170  59 nanoseconds |
=================================


=================================
| use inline assembler with mul |
=================================
| Product:  170  58 nanoseconds |
=================================

Answer 1

这些更像是提示，而不是“只是”一种解决方案：

1）将 TRIALS 提高几个数量级，使实际测量的总时间达到几秒钟的量级

2）重复测量几次（n = 100或更多），并取平均值（如果需要统计，平均值误差= rms / sqrt（n））

3）只测量您真正想测量的东西：至少应只把您感兴趣的代码放进 TRIALS 循环，并把计时放到循环外面，即：

// Suggested measurement shape: read the clock once before and once after the
// whole TRIALS loop, rather than around every single iteration.
t1 = Clock::now();  
for( int i=0; i<TRIALS; i++ )
    {
     ... only code relevant for your calculation here ...
    }
t2 = Clock::now();
// sum is the elapsed time of the entire loop, expressed in nanoseconds.
sum = chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();

4）最后，可以试试 Godbolt 的 Compiler Explorer（https://godbolt.org/），在那里可以查看代码在各种优化选项下生成的汇编输出。对于像您这样简单的代码（我试过），使用 -O3 时它只生成 mov eax,170。可见编译器很聪明，您很难用内联汇编胜过它！对于非平凡的例子，情况肯定也是如此。

为什么此代码中的内联汇编程序有时会更快，而有时会更慢？每次运行的执行时间差异很大

1 个答案: