C编译器循环展开澄清

时间:2016-10-14 11:53:45

标签: c msvcrt loop-unrolling

我无法理解MSVC编译器如何展开以下循环(对不起我对汇编语言的理解不好):

#define NUM_ITERATIONS (1000 * 1000 * 1000)
double dummySum = 0;

for (int x = 0; x < NUM_ITERATIONS; x++) {
    if (x & 1) 
       dummySum += x;
}

这是生成的汇编代码:

00007FF7B4511070  xorps       xmm1,xmm1  
        double dummySum = 0;
00007FF7B4511073  mov         ecx,2  
00007FF7B4511078  nop         dword ptr [rax+rax]  
        if (x & 1) 
00007FF7B4511080  lea         eax,[rcx-2]  
00007FF7B4511083  mov         r8d,eax  
00007FF7B4511086  and         r8d,1  
00007FF7B451108A  je          someTest+28h (07FF7B4511098h)  
            dummySum += x;
00007FF7B451108C  movd        xmm0,eax  
00007FF7B4511090  cvtdq2pd    xmm0,xmm0  
00007FF7B4511094  addsd       xmm1,xmm0  
        if (x & 1) 
00007FF7B4511098  lea         edx,[rcx-1]  
00007FF7B451109B  and         edx,1  
00007FF7B451109E  je          someTest+3Fh (07FF7B45110AFh)  
            dummySum += x;
00007FF7B45110A0  lea         eax,[rcx-1]  
00007FF7B45110A3  movd        xmm0,eax  
00007FF7B45110A7  cvtdq2pd    xmm0,xmm0  
00007FF7B45110AB  addsd       xmm1,xmm0  
00007FF7B45110AF  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110B2  je          someTest+50h (07FF7B45110C0h)  
            dummySum += x;
00007FF7B45110B4  movd        xmm0,ecx  
00007FF7B45110B8  cvtdq2pd    xmm0,xmm0  
00007FF7B45110BC  addsd       xmm1,xmm0  
00007FF7B45110C0  test        edx,edx  
        if (x & 1) 
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h)  
            dummySum += x;
00007FF7B45110C4  lea         eax,[rcx+1]  
00007FF7B45110C7  movd        xmm0,eax  
00007FF7B45110CB  cvtdq2pd    xmm0,xmm0  
00007FF7B45110CF  addsd       xmm1,xmm0  
00007FF7B45110D3  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  
            dummySum += x;
00007FF7B45110D8  lea         eax,[rcx+2]  
00007FF7B45110DB  movd        xmm0,eax  
00007FF7B45110DF  cvtdq2pd    xmm0,xmm0  
00007FF7B45110E3  addsd       xmm1,xmm0  
00007FF7B45110E7  test        edx,edx  
        if (x & 1) 
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  
            dummySum += x;
00007FF7B45110EB  lea         eax,[rcx+3]  
00007FF7B45110EE  movd        xmm0,eax  
00007FF7B45110F2  cvtdq2pd    xmm0,xmm0  
00007FF7B45110F6  addsd       xmm1,xmm0  
00007FF7B45110FA  test        r8d,r8d  
        if (x & 1) 
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  
            dummySum += x;
00007FF7B45110FF  lea         eax,[rcx+4]  
00007FF7B4511102  movd        xmm0,eax  
00007FF7B4511106  cvtdq2pd    xmm0,xmm0  
00007FF7B451110A  addsd       xmm1,xmm0  
00007FF7B451110E  test        edx,edx  
        if (x & 1) 
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)  
            dummySum += x;
00007FF7B4511112  lea         eax,[rcx+5]  
00007FF7B4511115  movd        xmm0,eax  
00007FF7B4511119  cvtdq2pd    xmm0,xmm0  
00007FF7B451111D  addsd       xmm1,xmm0  
00007FF7B4511121  test        r8d,r8d  
        if (x & 1) 
00007FF7B4511124  je          someTest+0C5h (07FF7B4511135h)  
            dummySum += x;
00007FF7B4511126  lea         eax,[rcx+6]  
00007FF7B4511129  movd        xmm0,eax  
00007FF7B451112D  cvtdq2pd    xmm0,xmm0  
00007FF7B4511131  addsd       xmm1,xmm0  
00007FF7B4511135  test        edx,edx  
        if (x & 1) 
00007FF7B4511137  je          someTest+0D8h (07FF7B4511148h)  
            dummySum += x;
00007FF7B4511139  lea         eax,[rcx+7]  
00007FF7B451113C  movd        xmm0,eax  
00007FF7B4511140  cvtdq2pd    xmm0,xmm0  
00007FF7B4511144  addsd       xmm1,xmm0  

    for (int x = 0; x < NUM_ITERATIONS; x++) {
00007FF7B4511148  add         ecx,0Ah  
00007FF7B451114B  lea         eax,[rcx-2]  
00007FF7B451114E  cmp         eax,3B9ACA00h  
00007FF7B4511153  jl          someTest+10h (07FF7B4511080h)  
    }

我理解这一部分(循环的开始):

// if (x % 2 == 0) jump over the summation

00007FF7B4511073  mov         ecx,2                          // ecx/rcx = 2
00007FF7B4511080  lea         eax,[rcx-2]                    // eax = rcx - 2
00007FF7B4511083  mov         r8d,eax                        // r8d = eax
00007FF7B4511086  and         r8d,1                          // r8d = r8d & 1
00007FF7B451108A  je          someTest+28h (07FF7B4511098h)  // jump if zero

// add double 

00007FF7B451108C  movd        xmm0,eax  
00007FF7B4511090  cvtdq2pd    xmm0,xmm0  
00007FF7B4511094  addsd       xmm1,xmm0  

但我不明白的是:如果查看跳转的目标地址(假设跳转发生),后续的跳转指令似乎会跳过下一条lea指令 - 请注意,下面的列表省略了各个跳转之间的指令:
00007FF7B45110C0  test        edx,edx  
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h) 

... addresses in between omitted ...

00007FF7B45110D3  test        r8d,r8d  
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  

... addresses in between omitted ...

00007FF7B45110E7  test        edx,edx  
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  

... addresses in between omitted ...

00007FF7B45110FA  test        r8d,r8d  
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  

... addresses in between omitted ...

00007FF7B451110E  test        edx,edx  
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)  

如果每次跳转都发生,看起来就只是交替执行 test r8d,r8d 和 test edx,edx 指令,而没有加载下一个值。

我在这里误解了什么?

1 个答案:

答案 0 :(得分:1)

好的,明白了,我单步跟踪了反汇编代码;编译器相当聪明。循环被展开为每次迭代执行原循环体10次,并且这些指令经过安排,使得 r8d 和 edx 在每次迭代中只加载一次:

lea         eax,[rcx-2]  
mov         r8d,eax  
and         r8d,1        // r8d is 0 here
...
lea         edx,[rcx-1]  
and         edx,1        // edx is 1 here

之后,这些寄存器在本次迭代的其余部分不会再被加载,因为编译器显然意识到 & 1 在每个奇数步骤上的计算结果为1(在每个偶数步骤上为0),所以只需交替测试这两个寄存器:

00007FF7B45110C0  test        edx,edx  // always 1
00007FF7B45110C2  je          someTest+63h (07FF7B45110D3h) 

... addresses in between omitted ...

00007FF7B45110D3  test        r8d,r8d  // always 0
00007FF7B45110D6  je          someTest+77h (07FF7B45110E7h)  

... addresses in between omitted ...

00007FF7B45110E7  test        edx,edx  // always 1
00007FF7B45110E9  je          someTest+8Ah (07FF7B45110FAh)  

... addresses in between omitted ...

00007FF7B45110FA  test        r8d,r8d  // always 0
00007FF7B45110FD  je          someTest+9Eh (07FF7B451110Eh)  

... addresses in between omitted ...

00007FF7B451110E  test        edx,edx  // always 1
00007FF7B4511110  je          someTest+0B1h (07FF7B4511121h)