下面的代码片段演示了这样一种情况:以两种不同的方式(即 Case0() 和 Case1())对 7 字节数据调用 CRC32 编译器内部函数(intrinsic),会导致不同的编译器优化。这些优化上的差异产生了截然不同的执行时间(例如 [Test_Construction, Case: 0, Bytes: 7])。
作为参考,我还加入了以相同方式对 6 字节数据调用 CRC32 的逻辑。但正如您从生成的输出中看到的那样,6 字节时的执行时间并没有出现 7 字节数据时那样的性能差异。
单次运行(Pass)生成的输出——针对每种数据大小(6 和 7 字节)各进行 4 项不同的测试:
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
我有几个问题:
1. 为什么编译器会产生不同的优化(特别是在 [Test_Construction, Case: 0, Bytes: 7] 的情况下)?
   - 看起来当 [Test_Construction, Case: 0, Bytes: 7] 被翻译成机器代码时,它包含了额外的指令:这些指令先将数据从堆栈移入寄存器,然后再移回堆栈。其他任何情况下似乎都不会发生这种现象。随后对寄存器中的数据调用一次 CRC,又对堆栈上的数据调用一次 CRC。为什么要这样做?
2. 性能究竟为什么会下降?
   - 是否是由于 [Test_Construction, Case: 0, Bytes: 7] 机器代码中额外的堆栈逻辑(内存操作)造成的?
   - 操作的顺序是否有影响?
3. 有没有办法阻止优化器生成这种次优的机器代码?
更新1 - 4/7/17:
// Copies exactly N bytes from szSrc to szDst, where N is a compile-time
// constant. The two buffers are handed straight to memcpy.
template <int N>
void MemCpy(char* szDst, const char* szSrc)
{
    const int cbCopy = N;  // byte count fixed by the template argument
    memcpy(szDst, szSrc, cbCopy);
}
// I tried both of these alternatives to memcpy, no luck.
// 7-byte specialization: copies the buffer with two 4-byte moves, one at
// offset 0 and one at offset 3, so byte 3 is written by both.
template<>
void MemCpy<7>(char* szDst, const char* szSrc)
{
    // Alternative 1 (disabled): 4-byte + 2-byte + 1-byte moves.
    //AS4(szDst) = AS4(szSrc), AS2(szDst+4) = AS2(szSrc+4), AS1(szDst+6) = AS1(szSrc+6);

    // Alternative 2: two overlapping 4-byte moves.
    AS4(szDst) = AS4(szSrc);
    AS4(szDst + 3) = AS4(szSrc + 3);
}
环境详情:
Windows Server 2012 R2 x64
Intel Xeon X5670
汇编参考:
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 7>: 139.5543 ms
-------------------------------------------------------
00007FF62D7911CC call CBench::CBench (07FF62D791000h)
00007FF62D7911D1 xor r8d,r8d
00007FF62D7911D4 lea r10,[_a (07FF62D794630h)]
00007FF62D7911DB mov r9d,1312D00h
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7911E1 mov rax,r8
00007FF62D7911E4 inc r8
00007FF62D7911E7 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7911EC lea rcx,[rax+rax*2]
00007FF62D7911F0 movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7911F6 mov edx,dword ptr [r10+rcx*8]
00007FF62D7911FA mov word ptr [rsp+44h],ax
00007FF62D7911FF movzx eax,byte ptr [r10+rcx*8+6]
00007FF62D791205 mov byte ptr [rsp+46h],al
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791209 mov eax,7
00007FF62D79120E crc32 eax,edx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D791213 mov dword ptr [buf],edx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791217 crc32 eax,dword ptr [rsp+43h]
00007FF62D79121E add ebx,eax
00007FF62D791220 sub r9,1
00007FF62D791224 jne Test_Func<0,7,0>+71h (07FF62D7911E1h)
}
return ii;
00007FF62D791226 lea rcx,[Bench]
00007FF62D79122B call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 7>: 38.6545 ms
-------------------------------------------------------
00007FF62D7912A9 call CBench::CBench (07FF62D791000h)
00007FF62D7912AE xor r8d,r8d
00007FF62D7912B1 lea r10,[_a (07FF62D794630h)]
00007FF62D7912B8 mov r9d,1312D00h
00007FF62D7912BE xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7912C0 mov rax,r8
00007FF62D7912C3 inc r8
00007FF62D7912C6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7912CB lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7912CF movzx eax,word ptr [r10+rcx*8+4]
00007FF62D7912D5 movzx edx,byte ptr [r10+rcx*8+6]
00007FF62D7912DB shl rdx,10h
00007FF62D7912DF or rdx,rax
00007FF62D7912E2 mov eax,dword ptr [r10+rcx*8]
00007FF62D7912E6 shl rdx,20h
00007FF62D7912EA or rdx,rax
00007FF62D7912ED mov eax,7
00007FF62D7912F2 crc32 rax,rdx
00007FF62D7912F8 add ebx,eax
00007FF62D7912FA sub r9,1
00007FF62D7912FE jne Test_Func<1,7,0>+70h (07FF62D7912C0h)
}
return ii;
00007FF62D791300 lea rcx,[Bench]
00007FF62D791305 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 7>: 26.2616 ms
-------------------------------------------------------
00007FF62D791386 call CBench::CBench (07FF62D791000h)
00007FF62D79138B xor edx,edx
00007FF62D79138D lea r9,[_a (07FF62D794630h)]
00007FF62D791394 mov r8d,1312D00h
00007FF62D79139A nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913A3 mov ecx,7
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7913A8 and eax,3FFh
00007FF62D7913AD inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7913B0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7913B4 crc32 ecx,dword ptr [r9+rax*8]
00007FF62D7913BB crc32 ecx,dword ptr [r9+rax*8+3]
00007FF62D7913C3 add ebx,ecx
00007FF62D7913C5 sub r8,1
00007FF62D7913C9 jne Test_Func<0,7,1>+70h (07FF62D7913A0h)
}
return ii;
00007FF62D7913CB lea rcx,[Bench]
00007FF62D7913D0 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 7>: 38.8118 ms
-------------------------------------------------------
00007FF62D791449 call CBench::CBench (07FF62D791000h)
00007FF62D79144E xor r8d,r8d
00007FF62D791451 lea r10,[_a (07FF62D794630h)]
00007FF62D791458 mov r9d,1312D00h
00007FF62D79145E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791460 mov rax,r8
00007FF62D791463 inc r8
00007FF62D791466 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79146B lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79146F movzx edx,byte ptr [r10+rax*8+6]
00007FF62D791475 lea rcx,[r10+rax*8]
00007FF62D791479 movzx eax,word ptr [r10+rax*8+4]
00007FF62D79147F shl rdx,10h
00007FF62D791483 or rdx,rax
00007FF62D791486 mov eax,dword ptr [rcx]
00007FF62D791488 shl rdx,20h
00007FF62D79148C or rdx,rax
00007FF62D79148F mov eax,7
00007FF62D791494 crc32 rax,rdx
00007FF62D79149A add ebx,eax
00007FF62D79149C sub r9,1
00007FF62D7914A0 jne Test_Func<1,7,1>+70h (07FF62D791460h)
}
return ii;
00007FF62D7914A2 lea rcx,[Bench]
00007FF62D7914A7 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 0, Bytes: 6>: 26.2925 ms
-------------------------------------------------------
00007FF62D791526 call CBench::CBench (07FF62D791000h)
00007FF62D79152B xor r8d,r8d
00007FF62D79152E lea r10,[_a (07FF62D794630h)]
00007FF62D791535 mov r9d,1312D00h
00007FF62D79153B nop dword ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791540 mov rax,r8
00007FF62D791543 inc r8
00007FF62D791546 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79154B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79154F mov eax,6
00007FF62D791554 crc32 eax,dword ptr [r10+rcx*8]
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79155B movzx edx,word ptr [r10+rcx*8+4]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791561 crc32 eax,dx
00007FF62D791567 add ebx,eax
00007FF62D791569 sub r9,1
00007FF62D79156D jne Test_Func<0,6,0>+70h (07FF62D791540h)
}
return ii;
00007FF62D79156F lea rcx,[Bench]
00007FF62D791574 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Construction <Case: 1, Bytes: 6>: 29.5819 ms
-------------------------------------------------------
00007FF62D7915F9 call CBench::CBench (07FF62D791000h)
00007FF62D7915FE xor r8d,r8d
00007FF62D791601 lea r10,[_a (07FF62D794630h)]
00007FF62D791608 mov r9d,1312D00h
00007FF62D79160E xchg ax,ax
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D791610 mov rax,r8
00007FF62D791613 inc r8
00007FF62D791616 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D79161B lea rcx,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D79161F mov eax,dword ptr [r10+rcx*8]
00007FF62D791623 movzx edx,word ptr [r10+rcx*8+4]
00007FF62D791629 shl rdx,20h
00007FF62D79162D or rdx,rax
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D791630 mov eax,6
00007FF62D791635 crc32 rax,rdx
00007FF62D79163B add ebx,eax
00007FF62D79163D sub r9,1
00007FF62D791641 jne Test_Func<1,6,0>+70h (07FF62D791610h)
}
return ii;
00007FF62D791643 lea rcx,[Bench]
00007FF62D791648 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 0, Bytes: 6>: 25.3754 ms
-------------------------------------------------------
00007FF62D7916C6 call CBench::CBench (07FF62D791000h)
00007FF62D7916CB xor edx,edx
00007FF62D7916CD lea r9,[_a (07FF62D794630h)]
00007FF62D7916D4 mov r8d,1312D00h
00007FF62D7916DA nop word ptr [rax+rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E0 mov rax,rdx
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916E3 mov ecx,6
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7916E8 and eax,3FFh
00007FF62D7916ED inc rdx
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7916F0 lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916F4 crc32 ecx,dword ptr [r9+rax*8]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7916FB crc32 ecx,word ptr [r9+rax*8+4]
00007FF62D791704 add ebx,ecx
00007FF62D791706 sub r8,1
00007FF62D79170A jne Test_Func<0,6,1>+70h (07FF62D7916E0h)
}
return ii;
00007FF62D79170C lea rcx,[Bench]
00007FF62D791711 call CBench::~CBench (07FF62D791030h)
-------------------------------------------------------
Test_Reference <Case: 1, Bytes: 6>: 28.7829 ms
-------------------------------------------------------
00007FF62D791799 call CBench::CBench (07FF62D791000h)
00007FF62D79179E xor edx,edx
00007FF62D7917A0 lea r9,[_a (07FF62D794630h)]
00007FF62D7917A7 mov r8d,1312D00h
00007FF62D7917AD nop dword ptr [rax]
for (int iPass = 0; iPass < PASSES; ++iPass) {
int i = iPass & (DimensionOf(_a) - 1);
00007FF62D7917B0 mov rax,rdx
00007FF62D7917B3 inc rdx
00007FF62D7917B6 and eax,3FFh
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
00007FF62D7917BB lea rax,[rax+rax*2]
ii += (CASE == 1) ? x.Case1() : x.Case0();
00007FF62D7917BF movzx ecx,word ptr [r9+rax*8+4]
00007FF62D7917C5 mov eax,dword ptr [r9+rax*8]
00007FF62D7917C9 shl rcx,20h
00007FF62D7917CD or rcx,rax
00007FF62D7917D0 mov eax,6
00007FF62D7917D5 crc32 rax,rcx
00007FF62D7917DB add ebx,eax
00007FF62D7917DD sub r8,1
00007FF62D7917E1 jne Test_Func<1,6,1>+70h (07FF62D7917B0h)
}
return ii;
00007FF62D7917E3 lea rcx,[Bench]
00007FF62D7917E8 call CBench::~CBench (07FF62D791030h)
源代码:
#include <Windows.h>
#include "new"
#include <cstdio>
#include <intrin.h>
// Number of elements in a true array (not valid for a pointer argument).
#define DimensionOf(x) (sizeof(x)/sizeof(*(x)))
// MSVC-specific inlining controls used to shape the generated code.
#define INL __forceinline
#define NOINL __declspec(noinline)
// Loop iterations executed by each timed test.
#define PASSES 20000000
// ASn(a_): treat the bytes at address a_ as an n-byte unsigned value.
// AS1/AS2/AS4 cast the address to U1*/U2*/U4* and dereference it, so they
// yield lvalues and can be read or assigned. No alignment is required by the
// macros themselves; callers use them at odd offsets.
#define AS1(a_) (*(U1*)(a_))
#define AS2(a_) (*(U2*)(a_))
// AS3: 2-byte load at a_ OR'ed with the byte at a_+2 shifted left by 16 (rvalue only).
#define AS3(a_) ((U4(AS1((char*)(a_) + 2))<<16) | AS2(a_))
#define AS4(a_) (*(U4*)(a_))
// AS6 / AS7: 4-byte load at a_ OR'ed with the AS2 / AS3 value taken at a_+4,
// shifted left by 32. Both produce a U8 rvalue.
#define AS6(a_) ((U8(AS2((char*)(a_) + 4))<<32) | AS4(a_))
#define AS7(a_) ((U8(AS3((char*)(a_) + 4))<<32) | AS4(a_))
// Unsigned integer aliases; the digit is the intended size in bytes.
// NOTE(review): the sizes are not enforced by a static_assert - confirm for the target compiler.
typedef unsigned char U1;
typedef unsigned short U2;
typedef unsigned int U4;
typedef unsigned long long U8;
// One 24-byte input record.
typedef char TData[24];
// Global table of 0x400 input records read by the tests. Nothing in the
// visible code writes to it.
TData _a[0x400];
// CBench is a scope timer for benchmarking code: the constructor samples the
// performance counter, and the destructor samples it again and prints the
// elapsed time in milliseconds next to the description.
class CBench {
    __int64 m_nStart;    // counter value captured at construction
    const char* m_desc;  // label printed with the timing; only the pointer is stored

public:
    // Every member function is declared NOINL (no inline).
    // Reasoning: it simplifies the assembly code, making it easier to see how
    // the optimizer treats different variations of an algorithm.

    // Returns the current value of the performance counter.
    NOINL static __int64 GetBenchMark(void) {
        __int64 ticks;
        QueryPerformanceCounter((LARGE_INTEGER*) &ticks);
        return ticks;
    }

    // Starts timing. szDesc must stay valid until this object is destroyed.
    NOINL CBench(const char *szDesc)
        : m_nStart(GetBenchMark()),
          m_desc(szDesc)
    {
    }

    // Stops timing and prints "<description>:\t<elapsed> ms".
    NOINL ~CBench() {
        __int64 ticksPerSecond;
        const __int64 elapsedTicks = GetBenchMark() - m_nStart;
        QueryPerformanceFrequency((LARGE_INTEGER*) &ticksPerSecond);
        const double elapsedMs = ((double) elapsedTicks * 1000) / ticksPerSecond;
        printf("%s:\t%10.4f ms\n", m_desc, elapsedMs);
    }
};
// CFunc executes CRC32 intrinsics on 6 & 7 bytes in two different ways
// It holds N bytes and feeds them to the CRC32 intrinsics, using N itself as
// the initial CRC value. Both member functions contain branches only for
// N == 7 and "otherwise"; the otherwise branch reads 6 bytes.
template <int N>
struct CFunc {
// The N payload bytes.
char m_ach[N];
// Copies the first N bytes of sz into the object.
INL CFunc(const char* sz) {
memcpy(m_ach, sz, N);
}
// Variant 0: two chained CRC steps on 32/16-bit pieces read directly from m_ach.
//   N == 7   : 32-bit step on bytes 0..3, then a 32-bit step on bytes 3..6
//              (the two loads overlap on byte 3).
//   otherwise: 32-bit step on bytes 0..3, then a 16-bit step on bytes 4..5.
INL U4 Case0() {
return (N == 7) ? _mm_crc32_u32(_mm_crc32_u32(N, AS4(m_ach)), AS4(m_ach + 3))
: _mm_crc32_u16(_mm_crc32_u32(N, AS4(m_ach)), AS2(m_ach + 4));
}
// Variant 1: a single 64-bit CRC step on a value assembled by AS7 / AS6 from
// the payload bytes (the unused high bits of the 64-bit operand are zero).
// The result is truncated to U4. It is a different function of the bytes than
// Case0, so the two variants are not expected to return equal values.
INL U4 Case1() {
return (N == 7) ? (U4) _mm_crc32_u64(N, AS7(m_ach))
: (U4) _mm_crc32_u64(N, AS6(m_ach));
}
};
// Runs PASSES iterations of one CRC variant and prints the elapsed time.
// Evaluates performance dependent on:
// - CASE : CRC procedure (1 selects Case1(), any other value selects Case0())
// - N : Number of bytes
// - USEREF : True, reference to pre-existing CFunc object
// False, constructing new CFunc object
// ii is the running sum: every pass adds one CRC result to it and the total
// is returned. main() passes each test's return value into the next test.
template<U4 CASE, int N, bool USEREF>
NOINL int Test_Func(int ii) {
// szDesc receives the label for the timing line; buf is the storage that
// placement new uses below when USEREF is false.
char szDesc[64], buf[64];
(USEREF) ? sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Reference", CASE, N)
: sprintf(szDesc, "%-18s<Case: %d, Bytes: %d>", "Test_Construction", CASE, N);
// Timing starts here, after the label is built; the elapsed time is printed
// when Bench is destroyed at function exit.
CBench Bench(szDesc);
for (int iPass = 0; iPass < PASSES; ++iPass) {
// Wrap the pass counter into the index range of _a. The mask form relies on
// the element count of _a being a power of two.
int i = iPass & (DimensionOf(_a) - 1);
// USEREF: view the bytes of _a[i] in place as a CFunc<N>.
// Otherwise: construct a CFunc<N> inside buf, copying N bytes from _a[i].
auto& x = (USEREF) ? *(CFunc<N>*)_a[i] : *new(buf) CFunc<N>(_a[i]);
ii += (CASE == 1) ? x.Case1() : x.Case0();
}
return ii;
}
// Runs all eight benchmark configurations ten times in a row. The running sum
// starts from argc, is passed through every test, and is returned as the exit
// code.
int main(int argc, char* argv[])
{
    int acc = argc;

    for (int round = 0; round != 10; ++round) {
        printf("\n>>>>\tPass %d:\n", round);

        // --- CRC over 7 bytes ---
        // CFunc object constructed on every pass
        acc = Test_Func<0, 7, false>(acc);
        acc = Test_Func<1, 7, false>(acc);
        // CFunc object referenced in place
        acc = Test_Func<0, 7, true>(acc);
        acc = Test_Func<1, 7, true>(acc);

        // --- CRC over 6 bytes ---
        // CFunc object constructed on every pass
        acc = Test_Func<0, 6, false>(acc);
        acc = Test_Func<1, 6, false>(acc);
        // CFunc object referenced in place
        acc = Test_Func<0, 6, true>(acc);
        acc = Test_Func<1, 6, true>(acc);
    }

    printf("\n\nDone\n");
    return acc;
}
答案 0 :(得分:2)
编译器把数据复制到 7 字节缓冲区时所用的操作,其填充寄存器的方式与 crc32 调用所需要的不同,因此编译器必须通过堆栈才能得到 crc32 调用所需的寄存器内容。不存在任何一种 1、2、4 字节读写的组合能够避免把数据完整地写到堆栈上。当我把这 7 个字节复制到一个 8 字节缓冲区,并用第二条未对齐的 4 字节 mov 使中间字节重复一份时,编译器发现已经有 2 个寄存器为 crc32 调用准备好了数据,于是消除了堆栈读/写。
125.997 ms: 使用memcpy,它执行对齐复制,以及未对齐的crc32:
memcpy(buf, _a[i], 7);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
movzx eax,word ptr [_a[i]+4]
mov edx,dword ptr [_a[i]]
mov word ptr [buf+4],ax
movzx eax,byte ptr [_a[i]+6]
mov byte ptr [buf+6],al
xor eax,eax
crc32 eax,edx
mov dword ptr [buf],edx
crc32 eax,dword ptr [buf+3]
第一次调用crc32可以使用副本中的寄存器edx,但第二次调用没有准备好寄存器。它需要将DWORD,WORD和BYTE的结果移动到buf中。除此之外,我怀疑编译器在这里看到一堆别名并且保守。编译器别无选择,只能在堆栈上构建buf然后访问它。
137.044 ms: MemCpy<7>,以未对齐且重叠的方式复制到 7 字节的 char buf,遇到同样的问题。复制步骤中用到的寄存器不是 crc32 步骤所需的寄存器。它还多了一些未对齐访问,所以又慢了一点:
AS4(buf) = AS4(_a[i]), AS4(buf + 3) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 3));
mov eax,dword ptr [_a[i]]
mov ecx,dword ptr [_a[i]+3]
mov dword ptr [buf],eax
xor eax,eax
mov dword ptr [buf+3],ecx
crc32 eax,dword ptr [buf]
crc32 eax,ecx
16.733 ms: 对源进行未对齐的重叠读取,但写入 8 字节的目标 buf 时互不重叠,性能有了巨大的提升!在这种情况下,中间的那个字节会被复制两次,但 buf 中的两个 DWORD 不会互相重叠(别名)。如果 _a[i] = "1234567",则 buf 将为 "12344567":
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u32(_mm_crc32_u32(0, AS4(buf)), AS4(buf + 4));
xor eax,eax
crc32 eax,dword ptr [_a[i]]
crc32 eax,dword ptr [_a[i]+3]
把第一个 DWORD 复制到 buf 的操作和把第二个 DWORD 复制到 buf + 4 的操作使用了 2 个独立的寄存器,这些寄存器可以直接传给 crc32,因此不需要再使用 buf。优化器在后续的优化遍(pass)中会注意到写入堆栈的数据未被使用,并删除相关操作。
121.500 ms: 然后我尝试用与上面相同的方式填充 8 字节的 char buf,再对它执行 64 位 crc,结果慢了很多。编译器没有用单个 8 字节寄存器来写入 buf。
AS4(buf) = AS4(_a[i]), AS4(buf + 4) = AS4(_a[i] + 3);
ii += _mm_crc32_u64(0, AS8(buf));
mov eax,dword ptr [_a[i]]
mov dword ptr [buf],eax
mov eax,dword ptr [_a[i]+3]
mov dword ptr [buf+3],eax
xor eax,eax
crc32 rax,qword ptr [buf]
20.799 ms: 我把对 buf 的写入改为一次 8 字节写入,而不是 2 次 4 字节写入。这样就不再使用堆栈了,但仍然比上面的第三种方法慢:
AS8(buf) = AS4(_a[i]) | ((U8)AS4(_a[i] + 3) << 32);
ii += _mm_crc32_u64(0, AS8(buf));
mov ecx,dword ptr [_a[i]+3]
mov eax,dword ptr [_a[i]]
shl rcx,20h
or rcx,rax
xor eax,eax
crc32 rax,rcx
方法 1:125.997 ms;方法 2:137.044 ms;方法 3:16.733 ms;方法 4:121.500 ms;方法 5:20.799 ms