Question

下面的代码片段演示了这样一种情况：以两种不同的方式（即 Case0() 和 Case1()）对 7 字节数据调用 CRC32 编译器内建函数（intrinsic），会导致不同的编译器优化。这些优化上的差异产生了截然不同的执行时间（例如 [Test_Construction，Case：0，Bytes：7]）。

作为参考，我还包含了以相同方式对 6 字节数据调用 CRC32 的逻辑。但是，正如您从生成的输出中看到的那样，6 字节时各种情况的执行时间并没有出现使用 7 字节数据时那样的性能差异。

单次运行（pass）生成的输出——针对每种关注的数据大小（6 和 7 字节）各进行 4 项不同的测试：

Test_Construction <Case: 0, Bytes: 7>:    139.5543 ms
Test_Construction <Case: 1, Bytes: 7>:     38.6545 ms
Test_Reference    <Case: 0, Bytes: 7>:     26.2616 ms
Test_Reference    <Case: 1, Bytes: 7>:     38.8118 ms
Test_Construction <Case: 0, Bytes: 6>:     26.2925 ms
Test_Construction <Case: 1, Bytes: 6>:     29.5819 ms
Test_Reference    <Case: 0, Bytes: 6>:     25.3754 ms
Test_Reference    <Case: 1, Bytes: 6>:     28.7829 ms

我有两个问题：

为什么编译器会产生不同的优化（特别是在 [Test_Construction，Case：0，Bytes：7] 的情况下）？

看起来当 [Test_Construction，Case：0，Bytes：7] 被翻译成机器代码时，它包含一些附加指令，这些指令先把数据从堆栈移入寄存器，然后再写回堆栈。在其他任何情况下似乎都不会发生这种情况。随后 CRC 被调用两次：一次作用于寄存器中的数据，一次作用于堆栈上的数据。为什么要这样做？



为什么性能首先下降？

是否是由于[Test_Construction，Case：0，Bytes：7]机器代码中的额外堆栈逻辑（内存操作）造成的？

操作的顺序是否有贡献？

有没有办法阻止优化器生成这个次优的机器代码？

更新1 - 4/7/17：

@ 1201ProgramAlarm，johnnycrash
- 我只是想澄清一下，我想优化/减少生成的机器代码。我故意重叠[Case：0，Bytes：7]中的第4个字节，以便调用CRC32_u32两次，以避免必须进行以下3次调用：CRC32_u32 + CRC32_u16 + CRC32_u8。
- 作为对你的建议的跟进，johnnycrash，我试图在CFunc的构造函数中完全删除对memcpy的调用，特别是在数据大小为7字节的情况下。请参阅下面的代码。但是，这对执行时间没有影响。

// Copies exactly N bytes from szSrc to szDst; the byte count is a
// compile-time constant taken from the template argument.
template<int N>
void MemCpy(char* szDst, const char* szSrc) {
    const int cbCopy = N;
    memcpy(szDst, szSrc, cbCopy);
}

// I tried both of these alternatives to memcpy, no luck.
// Explicit specialization for 7 bytes: instead of memcpy it issues two
// overlapping 4-byte moves (bytes 0..3, then bytes 3..6), so byte 3 is
// written twice.
template<> void MemCpy<7>(char* szDst, const char* szSrc) {
    // Other variant that was tried: non-overlapping 4 + 2 + 1 byte moves.
    //AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4),  AS1(szDst+6) = AS1(szSrc+6);
    AS4(szDst) = AS4(szSrc);
    AS4(szDst + 3) = AS4(szSrc + 3);
}

环境详情：

Windows Server 2012 R2 x64
Intel Xeon X5670

汇编参考：

-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>:    139.5543 ms
-------------------------------------------------------
00007FF62D7911CC  call        CBench::CBench (07FF62D791000h)  
00007FF62D7911D1  xor         r8d,r8d  
00007FF62D7911D4  lea         r10,[_a (07FF62D794630h)]  
00007FF62D7911DB  mov         r9d,1312D00h  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1  mov         rax,r8  
00007FF62D7911E4  inc         r8  
00007FF62D7911E7  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC  lea         rcx,[rax+rax*2]  
00007FF62D7911F0  movzx       eax,word ptr [r10+rcx*8+4]  
00007FF62D7911F6  mov         edx,dword ptr [r10+rcx*8]  
00007FF62D7911FA  mov         word ptr [rsp+44h],ax  
00007FF62D7911FF  movzx       eax,byte ptr [r10+rcx*8+6]  
00007FF62D791205  mov         byte ptr [rsp+46h],al  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209  mov         eax,7  
00007FF62D79120E  crc32       eax,edx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213  mov         dword ptr [buf],edx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217  crc32       eax,dword ptr [rsp+43h]  
00007FF62D79121E  add         ebx,eax  
00007FF62D791220  sub         r9,1  
00007FF62D791224  jne         Test_Func<0,7,0>+71h (07FF62D7911E1h)  
                }
                return ii;
00007FF62D791226  lea         rcx,[Bench]  
00007FF62D79122B  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>:     38.6545 ms
-------------------------------------------------------
00007FF62D7912A9  call        CBench::CBench (07FF62D791000h)  
00007FF62D7912AE  xor         r8d,r8d  
00007FF62D7912B1  lea         r10,[_a (07FF62D794630h)]  
00007FF62D7912B8  mov         r9d,1312D00h  
00007FF62D7912BE  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0  mov         rax,r8  
00007FF62D7912C3  inc         r8  
00007FF62D7912C6  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF  movzx       eax,word ptr [r10+rcx*8+4]  
00007FF62D7912D5  movzx       edx,byte ptr [r10+rcx*8+6]  
00007FF62D7912DB  shl         rdx,10h  
00007FF62D7912DF  or          rdx,rax  
00007FF62D7912E2  mov         eax,dword ptr [r10+rcx*8]  
00007FF62D7912E6  shl         rdx,20h  
00007FF62D7912EA  or          rdx,rax  
00007FF62D7912ED  mov         eax,7  
00007FF62D7912F2  crc32       rax,rdx  
00007FF62D7912F8  add         ebx,eax  
00007FF62D7912FA  sub         r9,1  
00007FF62D7912FE  jne         Test_Func<1,7,0>+70h (07FF62D7912C0h)  
                }
                return ii;
00007FF62D791300  lea         rcx,[Bench]  
00007FF62D791305  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Reference    <Case: 0, Bytes: 7>:     26.2616 ms
-------------------------------------------------------
00007FF62D791386  call        CBench::CBench (07FF62D791000h)  
00007FF62D79138B  xor         edx,edx  
00007FF62D79138D  lea         r9,[_a (07FF62D794630h)]  
00007FF62D791394  mov         r8d,1312D00h  
00007FF62D79139A  nop         word ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0  mov         rax,rdx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3  mov         ecx,7  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8  and         eax,3FFh  
00007FF62D7913AD  inc         rdx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4  crc32       ecx,dword ptr [r9+rax*8]  
00007FF62D7913BB  crc32       ecx,dword ptr [r9+rax*8+3]  
00007FF62D7913C3  add         ebx,ecx  
00007FF62D7913C5  sub         r8,1  
00007FF62D7913C9  jne         Test_Func<0,7,1>+70h (07FF62D7913A0h)  
                }
                return ii;
00007FF62D7913CB  lea         rcx,[Bench]  
00007FF62D7913D0  call        CBench::~CBench (07FF62D791030h)  


-------------------------------------------------------    
Test_Reference    <Case: 1, Bytes: 7>:     38.8118 ms
-------------------------------------------------------
00007FF62D791449  call        CBench::CBench (07FF62D791000h)  
00007FF62D79144E  xor         r8d,r8d  
00007FF62D791451  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791458  mov         r9d,1312D00h  
00007FF62D79145E  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460  mov         rax,r8  
00007FF62D791463  inc         r8  
00007FF62D791466  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F  movzx       edx,byte ptr [r10+rax*8+6]  
00007FF62D791475  lea         rcx,[r10+rax*8]  
00007FF62D791479  movzx       eax,word ptr [r10+rax*8+4]  
00007FF62D79147F  shl         rdx,10h  
00007FF62D791483  or          rdx,rax  
00007FF62D791486  mov         eax,dword ptr [rcx]  
00007FF62D791488  shl         rdx,20h  
00007FF62D79148C  or          rdx,rax  
00007FF62D79148F  mov         eax,7  
00007FF62D791494  crc32       rax,rdx  
00007FF62D79149A  add         ebx,eax  
00007FF62D79149C  sub         r9,1  
00007FF62D7914A0  jne         Test_Func<1,7,1>+70h (07FF62D791460h)  
                }
                return ii;
00007FF62D7914A2  lea         rcx,[Bench]  
00007FF62D7914A7  call        CBench::~CBench (07FF62D791030h) 


-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>:     26.2925 ms
-------------------------------------------------------
00007FF62D791526  call        CBench::CBench (07FF62D791000h)  
00007FF62D79152B  xor         r8d,r8d  
00007FF62D79152E  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791535  mov         r9d,1312D00h  
00007FF62D79153B  nop         dword ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540  mov         rax,r8  
00007FF62D791543  inc         r8  
00007FF62D791546  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F  mov         eax,6  
00007FF62D791554  crc32       eax,dword ptr [r10+rcx*8]  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B  movzx       edx,word ptr [r10+rcx*8+4]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561  crc32       eax,dx  
00007FF62D791567  add         ebx,eax  
00007FF62D791569  sub         r9,1  
00007FF62D79156D  jne         Test_Func<0,6,0>+70h (07FF62D791540h)  
                }
                return ii;
00007FF62D79156F  lea         rcx,[Bench]  
00007FF62D791574  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>:     29.5819 ms
-------------------------------------------------------
00007FF62D7915F9  call        CBench::CBench (07FF62D791000h)  
00007FF62D7915FE  xor         r8d,r8d  
00007FF62D791601  lea         r10,[_a (07FF62D794630h)]  
00007FF62D791608  mov         r9d,1312D00h  
00007FF62D79160E  xchg        ax,ax  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610  mov         rax,r8  
00007FF62D791613  inc         r8  
00007FF62D791616  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B  lea         rcx,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F  mov         eax,dword ptr [r10+rcx*8]  
00007FF62D791623  movzx       edx,word ptr [r10+rcx*8+4]  
00007FF62D791629  shl         rdx,20h  
00007FF62D79162D  or          rdx,rax  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630  mov         eax,6  
00007FF62D791635  crc32       rax,rdx  
00007FF62D79163B  add         ebx,eax  
00007FF62D79163D  sub         r9,1  
00007FF62D791641  jne         Test_Func<1,6,0>+70h (07FF62D791610h)  
                }
                return ii;
00007FF62D791643  lea         rcx,[Bench]  
00007FF62D791648  call        CBench::~CBench (07FF62D791030h) 


-------------------------------------------------------
Test_Reference    <Case: 0, Bytes: 6>:     25.3754 ms
-------------------------------------------------------
00007FF62D7916C6  call        CBench::CBench (07FF62D791000h)  
00007FF62D7916CB  xor         edx,edx  
00007FF62D7916CD  lea         r9,[_a (07FF62D794630h)]  
00007FF62D7916D4  mov         r8d,1312D00h  
00007FF62D7916DA  nop         word ptr [rax+rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0  mov         rax,rdx  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3  mov         ecx,6  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8  and         eax,3FFh  
00007FF62D7916ED  inc         rdx  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4  crc32       ecx,dword ptr [r9+rax*8]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB  crc32       ecx,word ptr [r9+rax*8+4]  
00007FF62D791704  add         ebx,ecx  
00007FF62D791706  sub         r8,1  
00007FF62D79170A  jne         Test_Func<0,6,1>+70h (07FF62D7916E0h)  
                }
                return ii;
00007FF62D79170C  lea         rcx,[Bench]  
00007FF62D791711  call        CBench::~CBench (07FF62D791030h)


-------------------------------------------------------
Test_Reference    <Case: 1, Bytes: 6>:     28.7829 ms
-------------------------------------------------------
00007FF62D791799  call        CBench::CBench (07FF62D791000h)  
00007FF62D79179E  xor         edx,edx  
00007FF62D7917A0  lea         r9,[_a (07FF62D794630h)]  
00007FF62D7917A7  mov         r8d,1312D00h  
00007FF62D7917AD  nop         dword ptr [rax]  
                for (int iPass = 0; iPass < PASSES; ++iPass) {
                                int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0  mov         rax,rdx  
00007FF62D7917B3  inc         rdx  
00007FF62D7917B6  and         eax,3FFh  
                                auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB  lea         rax,[rax+rax*2]  
                                ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF  movzx       ecx,word ptr [r9+rax*8+4]  
00007FF62D7917C5  mov         eax,dword ptr [r9+rax*8]  
00007FF62D7917C9  shl         rcx,20h  
00007FF62D7917CD  or          rcx,rax  
00007FF62D7917D0  mov         eax,6  
00007FF62D7917D5  crc32       rax,rcx  
00007FF62D7917DB  add         ebx,eax  
00007FF62D7917DD  sub         r8,1  
00007FF62D7917E1  jne         Test_Func<1,6,1>+70h (07FF62D7917B0h)  
                }
                return ii;
00007FF62D7917E3  lea         rcx,[Bench]  
00007FF62D7917E8  call        CBench::~CBench (07FF62D791030h)

源代码：

#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>

// Number of elements in array x (size of the whole array divided by the size
// of its first element).
#define DimensionOf(x)      (sizeof(x)/sizeof(*(x)))
// Compiler-specific inlining controls: force inlining / forbid inlining.
#define INL                 __forceinline
#define NOINL               __declspec(noinline)
// Iteration count of the timed loop in Test_Func.
#define PASSES              20000000
// ASn(a_) views the memory at address a_ as an n-byte unsigned value.
// AS1/AS2/AS4 are plain pointer casts and yield lvalues, so they can be used
// for both loads and stores; no alignment adjustment is performed.
#define AS1(a_)             (*(U1*)(a_))
#define AS2(a_)             (*(U2*)(a_))
// AS3: U2 at a_ in bits 0..15, U1 at a_+2 in bits 16..23 (rvalue only).
#define AS3(a_)             ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_)             (*(U4*)(a_))
// AS6: U4 at a_ in bits 0..31, U2 at a_+4 in bits 32..47 (rvalue only).
#define AS6(a_)             ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
// AS7: U4 at a_ in bits 0..31, AS3 at a_+4 in bits 32..55 (rvalue only).
#define AS7(a_)             ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))

// Unsigned integer aliases; the digit is the intended size in bytes
// (NOTE(review): holds for the MSVC/x64 target used here -- confirm elsewhere).
typedef unsigned char       U1;
typedef unsigned short      U2;
typedef unsigned int        U4;
typedef unsigned long long  U8;

// One 24-byte input record.
typedef char TData[24];
// Global table of 0x400 input records. Test_Func indexes it with
// iPass & (DimensionOf(_a) - 1), which relies on the count being a power of two.
TData _a[0x400];

// CBench is a scope timer for benchmarking code: the constructor records a
// QueryPerformanceCounter reading, the destructor reads the counter again and
// prints "<description>:\t<elapsed> ms".
class CBench {
    __int64     m_nStart;   // counter value captured at construction
    const char* m_desc;     // caller-owned label; must outlive this object
public:
    // Deliberately not inlined.
    // Reasoning:   Simplifies the assembly code.
    //              Easier to see how the optimizer optimizes different variations of an algorithm.
    //
    // szDesc: label printed in front of the elapsed time. Only the pointer is
    //         stored, the text is not copied.
    // The initializer list follows the member declaration order, which is the
    // order in which the members are actually initialized.
    NOINL CBench(const char *szDesc)
        : m_nStart(GetBenchMark()), m_desc(szDesc) { }

    // The destructor prints the result, so a copy would report the same
    // measurement twice; forbid copying.
    CBench(const CBench&) = delete;
    CBench& operator=(const CBench&) = delete;

    // Stops the clock first, then converts the tick delta to milliseconds
    // using the counter frequency (ticks per second) and prints it.
    NOINL ~CBench() {
        const __int64 deltaTime = GetBenchMark() - m_nStart;
        LARGE_INTEGER cpuFreq;
        QueryPerformanceFrequency(&cpuFreq);
        const double execTimeInMS =
            (static_cast<double>(deltaTime) * 1000) / cpuFreq.QuadPart;
        printf("%s:\t%10.4f ms\n", m_desc, execTimeInMS);
    }

    // Returns the current QueryPerformanceCounter value in ticks.
    NOINL static __int64 GetBenchMark(void) {
        LARGE_INTEGER nBenchMark;
        QueryPerformanceCounter(&nBenchMark);
        return nBenchMark.QuadPart;
    }
};

// CFunc holds N bytes and executes the CRC32 intrinsics on them in two
// different ways. Only N == 6 and N == 7 are supported.
template <int N>
struct CFunc {
    // Case0/Case1 choose between the 7-byte and the 6-byte path with
    // (N == 7). Without this check every other N would silently take the
    // 6-byte path and hash the wrong number of bytes (reading past m_ach
    // when N < 6).
    static_assert(N == 6 || N == 7, "CFunc supports only 6 or 7 bytes");

    char m_ach[N];

    // Copies the first N bytes at sz into m_ach.
    INL CFunc(const char* sz) {
        memcpy(m_ach, sz, N);
    }

    // Two chained CRC32 steps, seeded with N.
    //   N == 7: 32-bit step over bytes 0..3, then a 32-bit step over bytes
    //           3..6 -- the loads overlap, so byte 3 is fed in twice.
    //   N == 6: 32-bit step over bytes 0..3, then a 16-bit step over bytes 4..5.
    INL U4 Case0() {
        return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
                        : _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
    }

    // One 64-bit CRC32 step, seeded with N, over the value that AS7 / AS6
    // assembles from the 7 / 6 bytes; the result is truncated to U4.
    INL U4 Case1() {
        return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
                        : (U4) _mm_crc32_u64(N, AS6(m_ach));
    }

};

// Evaluates performance dependent on:
//  -   CASE    :   CRC procedure (1 -> CFunc::Case1, anything else -> CFunc::Case0)
//  -   N       :   Number of bytes
//  -   USEREF  :   True,   reference to pre-existing CFunc object (the bytes
//                          of _a[i] are reinterpreted in place)
//                  False,  constructing new CFunc object (placement-new into
//                          a local buffer, copying N bytes from _a[i])
//
// ii is the running accumulator: every CRC result is added to it and the sum
// is returned. The timed region is the lifetime of Bench, i.e. from its
// construction until the function returns.
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
    char szDesc[64], buf[64];
    // CASE is unsigned (U4), so it is printed with %u; snprintf bounds the
    // write to szDesc. The resulting text is the same as before.
    snprintf(szDesc, sizeof(szDesc), "%-18s<Case: %u, Bytes: %d>",
             USEREF ? "Test_Reference" : "Test_Construction", CASE, N);
    CBench Bench(szDesc);
    for (int iPass = 0; iPass < PASSES; ++iPass) {
        // Cycle through the table; the mask works because the element count
        // of _a is a power of two.
        int i = iPass & (DimensionOf(_a) - 1);
        auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
        ii += (CASE == 1) ? x.Case1() : x.Case0();
    }
    return ii;
}

// Runs the eight benchmark variants ten times in a row. A single value,
// seeded from argc, is passed into each test and replaced by its result; the
// final value is returned as the exit code (presumably so that the compiler
// cannot discard the measured work -- the source does not say).
int main(int argc, char* argv[]) {
    int checksum = argc;
    for (int pass = 0; pass < 10; ++pass) {
        printf("\n>>>>\tPass %d:\n", pass);

        // --- CRC over 7 bytes ---
        // CFunc object constructed inside the loop
        checksum = Test_Func<0, 7, false>(checksum);
        checksum = Test_Func<1, 7, false>(checksum);
        // CFunc object referenced in place
        checksum = Test_Func<0, 7, true>(checksum);
        checksum = Test_Func<1, 7, true>(checksum);

        // --- CRC over 6 bytes ---
        // CFunc object constructed inside the loop
        checksum = Test_Func<0, 6, false>(checksum);
        checksum = Test_Func<1, 6, false>(checksum);
        // CFunc object referenced in place
        checksum = Test_Func<0, 6, true>(checksum);
        checksum = Test_Func<1, 6, true>(checksum);
    }
    printf("\n\nDone\n");
    return checksum;
}

Answer 1

编译器把数据复制到 7 字节缓冲区时所用的操作，填充寄存器的方式与 crc32 调用所需要的不一致，因此编译器必须经由堆栈才能得到 crc32 调用所需的操作数。不存在任何一种 1、2、4 字节读写的组合可以避免把数据完整写入堆栈。而当我把这 7 个字节复制到 8 字节缓冲区、并用第二个未对齐的 4 字节 mov 重复复制中间那个字节时，编译器发现 crc32 调用所需的 2 个寄存器已经填充好，于是消除了堆栈读/写。

125.997 ms： 使用memcpy，它执行对齐复制，以及未对齐的crc32：

memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
    movzx       eax,word ptr [_a[i]+4]  
    mov         edx,dword ptr [_a[i]]  
    mov         word ptr [buf+4],ax  
    movzx       eax,byte ptr [_a[i]+6]  
    mov         byte ptr [buf+6],al  
    xor         eax,eax  
    crc32       eax,edx  
    mov         dword ptr [buf],edx  
    crc32       eax,dword ptr [buf+3]

第一次调用crc32可以使用副本中的寄存器edx，但第二次调用没有准备好寄存器。它需要将DWORD，WORD和BYTE的结果移动到buf中。除此之外，我怀疑编译器在这里看到一堆别名并且保守。编译器别无选择，只能在堆栈上构建buf然后访问它。

137.044 ms： MemCpy<7>，即以未对齐且相互重叠的方式复制到 7 个 char 的 buf，遇到同样的问题。复制步骤中用到的寄存器不是 crc32 步骤所需的寄存器。它多了一些未对齐访问，所以还稍慢了一点：

AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
    mov         eax,dword ptr [_a[i]]  
    mov         ecx,dword ptr [_a[i]+3]  
    mov         dword ptr [buf],eax  
    xor         eax,eax  
    mov         dword ptr [buf+3],ecx  
    crc32       eax,dword ptr [buf]  
    crc32       eax,ecx

16.733 ms： 未对齐的重叠访问源但不重叠到8字节的目标buf，看到了巨大的改进！在这种情况下，我们将中间字节复制两次，但我们永远不会在buf中为DWORDS添加别名。如果_a [i] =“1234567”，则buf将为“12344567”：

AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
    xor         eax,eax  
    crc32       eax,dword ptr [_a[i]]  
    crc32       eax,dword ptr [_a[i]+3]

把第一个 DWORD 复制到 buf 的操作和把第二个 DWORD 复制到 buf + 4 的操作使用了 2 个独立的寄存器，这两个寄存器可以直接传给 crc32，因此不需要再使用 buf。优化器在后续的优化遍中会注意到写入堆栈的数据未被使用，并删除相关操作。

121.500 ms： 然后我尝试用与上面相同的方式填充 8 个 char 的 buf，再对其执行 64 位 crc，结果性能大幅下降。编译器没有使用单个 8 字节寄存器来写入 buf。

AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
    mov         eax,dword ptr [_a[i]]  
    mov         dword ptr [buf],eax  
    mov         eax,dword ptr [_a[i]+3]  
    mov         dword ptr [buf+3],eax  
    xor         eax,eax  
    crc32       rax,qword ptr [buf]

20.799 ms： 我将移动到buf改为8字节而不是2 x 4字节。这停止使用堆栈，但仍然落后于上面的第三种方法：

AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
    mov         ecx,dword ptr [_a[i]+3]  
    mov         eax,dword ptr [_a[i]]  
    shl         rcx,20h  
    or          rcx,rax  
    xor         eax,eax  
    crc32       rax,rcx

各方法耗时汇总：方法 1：125.997 ms；方法 2：137.044 ms；方法 3：16.733 ms；方法 4：121.500 ms；方法 5：20.799 ms。

为什么编译器以不同方式优化这些情况？

1 个答案: