我想测试计算机视觉算法的性能,最后写出了下面这段基本代码,只是为了检查哪种循环最快。但我无法解释得到的结果:通常结果显示,双重 for 循环比单层循环快 3 倍。如果我交换两个循环的顺序,结果仍然一样,这意味着排在后面的那个循环总是被优化了……那么编译器到底做了什么样的优化?
对不起,我知道这一定是个愚蠢的问题......
// Micro-benchmark: times a single loop against a pair of nested loops; each
// version increments k one million times in total.
// NOTE(review): k is never read after either loop in this snippet, so an
// optimizing build may remove the loops entirely - confirm against the
// generated assembly before trusting the timings.
ulong k = 0;
auto start = std::chrono::high_resolution_clock::now();
// Single loop: 1,000,000 increments.
for( uint i = 0; i < 1000000; ++i )
{
k++;
}
// Elapsed time of the single loop, converted to nanoseconds.
auto diff = std::chrono::high_resolution_clock::now() - start;
auto t1 = std::chrono::duration_cast<std::chrono::nanoseconds>(diff);
// Reset the counter and restart the clock for the second measurement.
k = 0;
start = std::chrono::high_resolution_clock::now();
// Nested loops: 1000 * 1000 = 1,000,000 increments.
for( uint i = 0; i < 1000; ++i )
{
for( uint j = 0; j < 1000; ++j )
{
k++;
}
}
// Elapsed time of the nested loops, converted to nanoseconds.
diff = std::chrono::high_resolution_clock::now() - start;
auto t2 = std::chrono::duration_cast<std::chrono::nanoseconds>(diff);
// Report both durations as nanosecond tick counts.
CL_PRINT( "Simple: ", t1.count() );
CL_PRINT( "Double: ", t2.count() );
如果我交换两个循环的顺序,结果仍然一样,这意味着排在后面的那个循环总是被优化了……
请注意,CL_PRINT 只是一个用于调试输出的宏。
另请注意,我使用以下选项编译代码:-O3 -msse4.1
答案 0 :(得分:2)
这里的答案是:测得的时间本身就在波动。当我在自己的机器上运行这段代码(this code)时,有时是第一个循环显示 1000,有时则是第二个循环显示 1000。这完全取决于计时器恰好在什么时候跳变,纯属“运气”。如果您有一个更精确的计时器,它显示出的差异大概也只是来自读取计时器本身所花的时间之类的开销。
$ ./a.out
k = 1000000
k = 1000000
Simple: 0
Double: 1000
$ ./a.out
k = 1000000
k = 1000000
Simple: 1000
Double: 0
$ ./a.out
k = 1000000
k = 1000000
Simple: 1000
Double: 0
$ ./a.out
k = 1000000
k = 1000000
Simple: 1000
Double: 0
很容易看出,两个循环都已经被优化掉了:
# Compiler-generated x86-64 (AT&T syntax) listing for main().
# The lines wrapped in ** ... ** are highlight markers added by the author of
# the listing; they are not part of the assembly syntax.
# No loop instructions appear anywhere in this listing.
main:
.LFB1474:
.cfi_startproc
# Prologue: save the callee-saved registers used to hold the two durations.
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
subq $8, %rsp
.cfi_def_cfa_offset 32
# First measurement: two back-to-back calls to std::chrono::system_clock::now()
# with nothing in between.
call _ZNSt6chrono12system_clock3nowEv
movq %rax, %rbx
call _ZNSt6chrono12system_clock3nowEv
movl $.LC0, %esi
# end - start for the first measurement.
**subq %rbx, %rax**
movl $_ZSt4cout, %edi
# Scale the tick difference by 1000 and keep it in %rbp
# (presumably microseconds -> nanoseconds; confirm against the clock's period).
imulq $1000, %rax, %rbp
# operator<<(std::ostream&, const char*) with the string at .LC0
# (contents not shown in this listing).
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
# The counter value is passed as the immediate constant 1000000.
**movl $1000000, %esi**
movq %rax, %rdi
# std::ostream::_M_insert<unsigned long>
call _ZNSo9_M_insertImEERSoT_
movq %rax, %rdi
# std::endl
call _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
# Second measurement: same pattern, again with no code between the two now() calls.
call _ZNSt6chrono12system_clock3nowEv
movq %rax, %rbx
call _ZNSt6chrono12system_clock3nowEv
movl $.LC0, %esi
# end - start for the second measurement.
**subq %rbx, %rax**
movl $_ZSt4cout, %edi
# Scaled second duration is kept in %rbx.
imulq $1000, %rax, %rbx
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
# Again the counter value is the immediate constant 1000000.
**movl $1000000, %esi**
movq %rax, %rdi
call _ZNSo9_M_insertImEERSoT_
movq %rax, %rdi
call _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
# Print the string at .LC1 followed by the first duration (%rbp).
movl $.LC1, %esi
movl $_ZSt4cout, %edi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movq %rbp, %rsi
movq %rax, %rdi
# std::ostream::_M_insert<long>
call _ZNSo9_M_insertIlEERSoT_
movq %rax, %rdi
call _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
# Print the string at .LC2 followed by the second duration (%rbx).
movl $.LC2, %esi
movl $_ZSt4cout, %edi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movq %rbx, %rsi
movq %rax, %rdi
call _ZNSo9_M_insertIlEERSoT_
movq %rax, %rdi
call _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
# Epilogue: release the stack slot, return 0, restore the saved registers.
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
您可以清楚地看到,k 是作为常量被插入到输出流中的,并且“之前”和“之后”的时间被相继取得后直接相减,中间(几乎)没有任何代码。(“有趣”的部分用 ** ... ** 标记——当然,在代码块中它并不会显示为粗体。)