我在Godbolt上使用尾部呼叫优化(TCO)和各种优化级别。我有一个非常简单的功能(从here提起):
int factorial (int x, int y){
if (x==0)
return y;
else
return factorial(x-1,y*x);
}
如果我设置-O1
,那么一切都很好,并且我了解生成的程序集。但是,如果我设置-O2
,则会发生更多的事情,而且我不知道发生了什么。有人可以解释吗?
在-02
生成Assembly:
.LCPI0_0:
.long 0 # 0x0
.long 4294967295 # 0xffffffff
.long 4294967294 # 0xfffffffe
.long 4294967293 # 0xfffffffd
.LCPI0_1:
.zero 4
.long 1 # 0x1
.long 1 # 0x1
.long 1 # 0x1
.LCPI0_2:
.long 1 # 0x1
.long 1 # 0x1
.long 1 # 0x1
.long 1 # 0x1
.LCPI0_3:
.long 4294967292 # 0xfffffffc
.long 4294967292 # 0xfffffffc
.long 4294967292 # 0xfffffffc
.long 4294967292 # 0xfffffffc
.LCPI0_4:
.long 4294967288 # 0xfffffff8
.long 4294967288 # 0xfffffff8
.long 4294967288 # 0xfffffff8
.long 4294967288 # 0xfffffff8
.LCPI0_5:
.long 4294967284 # 0xfffffff4
.long 4294967284 # 0xfffffff4
.long 4294967284 # 0xfffffff4
.long 4294967284 # 0xfffffff4
.LCPI0_6:
.long 4294967280 # 0xfffffff0
.long 4294967280 # 0xfffffff0
.long 4294967280 # 0xfffffff0
.long 4294967280 # 0xfffffff0
.LCPI0_7:
.long 4294967276 # 0xffffffec
.long 4294967276 # 0xffffffec
.long 4294967276 # 0xffffffec
.long 4294967276 # 0xffffffec
.LCPI0_8:
.long 4294967272 # 0xffffffe8
.long 4294967272 # 0xffffffe8
.long 4294967272 # 0xffffffe8
.long 4294967272 # 0xffffffe8
.LCPI0_9:
.long 4294967268 # 0xffffffe4
.long 4294967268 # 0xffffffe4
.long 4294967268 # 0xffffffe4
.long 4294967268 # 0xffffffe4
.LCPI0_10:
.long 4294967264 # 0xffffffe0
.long 4294967264 # 0xffffffe0
.long 4294967264 # 0xffffffe0
.long 4294967264 # 0xffffffe0
factorial(int, int): # @factorial(int, int)
mov eax, esi
test edi, edi
je .LBB0_12
cmp edi, 8
jb .LBB0_11
mov ecx, edi
and ecx, -8
movd xmm0, edi
pshufd xmm6, xmm0, 0 # xmm6 = xmm0[0,0,0,0]
paddd xmm6, xmmword ptr [rip + .LCPI0_0]
movd xmm0, eax
movaps xmm1, xmmword ptr [rip + .LCPI0_1] # xmm1 = <u,1,1,1>
movss xmm1, xmm0 # xmm1 = xmm0[0],xmm1[1,2,3]
lea esi, [rcx - 8]
mov edx, esi
shr edx, 3
add edx, 1
mov eax, edx
and eax, 3
cmp esi, 24
jae .LBB0_4
movdqa xmm2, xmmword ptr [rip + .LCPI0_2] # xmm2 = [1,1,1,1]
test eax, eax
jne .LBB0_7
jmp .LBB0_9
.LBB0_4:
mov esi, 1
sub esi, edx
lea edx, [rax + rsi]
add edx, -1
movdqa xmm2, xmmword ptr [rip + .LCPI0_2] # xmm2 = [1,1,1,1]
movdqa xmm9, xmmword ptr [rip + .LCPI0_4] # xmm9 = [4294967288,4294967288,4294967288,4294967288]
movdqa xmm10, xmmword ptr [rip + .LCPI0_5] # xmm10 = [4294967284,4294967284,4294967284,4294967284]
movdqa xmm11, xmmword ptr [rip + .LCPI0_6] # xmm11 = [4294967280,4294967280,4294967280,4294967280]
movdqa xmm12, xmmword ptr [rip + .LCPI0_7] # xmm12 = [4294967276,4294967276,4294967276,4294967276]
movdqa xmm13, xmmword ptr [rip + .LCPI0_8] # xmm13 = [4294967272,4294967272,4294967272,4294967272]
movdqa xmm14, xmmword ptr [rip + .LCPI0_9] # xmm14 = [4294967268,4294967268,4294967268,4294967268]
movdqa xmm15, xmmword ptr [rip + .LCPI0_10] # xmm15 = [4294967264,4294967264,4294967264,4294967264]
.LBB0_5: # =>This Inner Loop Header: Depth=1
movdqa xmm0, xmm6
paddd xmm0, xmmword ptr [rip + .LCPI0_3]
pshufd xmm7, xmm6, 245 # xmm7 = xmm6[1,1,3,3]
pshufd xmm3, xmm1, 245 # xmm3 = xmm1[1,1,3,3]
pmuludq xmm3, xmm7
pmuludq xmm1, xmm6
pshufd xmm7, xmm2, 245 # xmm7 = xmm2[1,1,3,3]
pshufd xmm4, xmm0, 245 # xmm4 = xmm0[1,1,3,3]
pmuludq xmm4, xmm7
pmuludq xmm0, xmm2
movdqa xmm2, xmm6
paddd xmm2, xmm9
movdqa xmm7, xmm6
paddd xmm7, xmm10
pmuludq xmm1, xmm2
pshufd xmm2, xmm2, 245 # xmm2 = xmm2[1,1,3,3]
pmuludq xmm2, xmm3
pmuludq xmm0, xmm7
pshufd xmm3, xmm7, 245 # xmm3 = xmm7[1,1,3,3]
pmuludq xmm3, xmm4
movdqa xmm4, xmm6
paddd xmm4, xmm11
movdqa xmm7, xmm6
paddd xmm7, xmm12
pshufd xmm5, xmm4, 245 # xmm5 = xmm4[1,1,3,3]
pmuludq xmm5, xmm2
pmuludq xmm4, xmm1
pshufd xmm8, xmm7, 245 # xmm8 = xmm7[1,1,3,3]
pmuludq xmm8, xmm3
pmuludq xmm7, xmm0
movdqa xmm0, xmm6
paddd xmm0, xmm13
movdqa xmm3, xmm6
paddd xmm3, xmm14
pmuludq xmm4, xmm0
pshufd xmm1, xmm4, 232 # xmm1 = xmm4[0,2,2,3]
pshufd xmm0, xmm0, 245 # xmm0 = xmm0[1,1,3,3]
pmuludq xmm0, xmm5
pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3]
punpckldq xmm1, xmm0 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
pmuludq xmm7, xmm3
pshufd xmm2, xmm7, 232 # xmm2 = xmm7[0,2,2,3]
pshufd xmm0, xmm3, 245 # xmm0 = xmm3[1,1,3,3]
pmuludq xmm0, xmm8
pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3]
punpckldq xmm2, xmm0 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
paddd xmm6, xmm15
add edx, 4
jne .LBB0_5
test eax, eax
je .LBB0_9
.LBB0_7:
neg eax
movdqa xmm3, xmmword ptr [rip + .LCPI0_3] # xmm3 = [4294967292,4294967292,4294967292,4294967292]
movdqa xmm4, xmmword ptr [rip + .LCPI0_4] # xmm4 = [4294967288,4294967288,4294967288,4294967288]
.LBB0_8: # =>This Inner Loop Header: Depth=1
movdqa xmm0, xmm6
paddd xmm0, xmm3
pshufd xmm5, xmm1, 245 # xmm5 = xmm1[1,1,3,3]
pmuludq xmm1, xmm6
pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3]
pshufd xmm7, xmm6, 245 # xmm7 = xmm6[1,1,3,3]
pmuludq xmm7, xmm5
pshufd xmm5, xmm7, 232 # xmm5 = xmm7[0,2,2,3]
punpckldq xmm1, xmm5 # xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
pshufd xmm5, xmm2, 245 # xmm5 = xmm2[1,1,3,3]
pmuludq xmm2, xmm0
pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3]
pshufd xmm0, xmm0, 245 # xmm0 = xmm0[1,1,3,3]
pmuludq xmm0, xmm5
pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3]
punpckldq xmm2, xmm0 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
paddd xmm6, xmm4
inc eax
jne .LBB0_8
.LBB0_9:
pshufd xmm0, xmm2, 245 # xmm0 = xmm2[1,1,3,3]
pmuludq xmm2, xmm1
pshufd xmm2, xmm2, 232 # xmm2 = xmm2[0,2,2,3]
pshufd xmm1, xmm1, 245 # xmm1 = xmm1[1,1,3,3]
pmuludq xmm1, xmm0
pshufd xmm0, xmm1, 232 # xmm0 = xmm1[0,2,2,3]
punpckldq xmm2, xmm0 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
pshufd xmm0, xmm2, 78 # xmm0 = xmm2[2,3,0,1]
pmuludq xmm0, xmm2
pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3]
pshufd xmm2, xmm1, 10 # xmm2 = xmm1[2,2,0,0]
pmuludq xmm2, xmm1
pshufd xmm1, xmm2, 232 # xmm1 = xmm2[0,2,2,3]
punpckldq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
pshufd xmm1, xmm0, 229 # xmm1 = xmm0[1,1,2,3]
pmuludq xmm1, xmm0
movd eax, xmm1
cmp ecx, edi
je .LBB0_12
sub edi, ecx
.LBB0_11: # =>This Inner Loop Header: Depth=1
imul eax, edi
add edi, -1
jne .LBB0_11
.LBB0_12:
ret
在-01
处生成的程序集:
factorial(int, int): # @factorial(int, int)
mov eax, esi
test edi, edi
je .LBB0_2
.LBB0_1: # =>This Inner Loop Header: Depth=1
imul eax, edi
add edi, -1
jne .LBB0_1
.LBB0_2:
ret