为什么编译器能够很好地优化下面这段代码:
/// Builds a bit set from `knights` by combining masked horizontal shifts with
/// vertical shifts, using named intermediate values.
///
/// The input is shifted by one and by two bit positions in both directions and
/// each shifted copy is masked with an NFile* constant. The one-position set is
/// then shifted by 16 bits and the two-position set by 8 bits, again in both
/// directions, and the four results are OR-ed together.
///
/// NOTE(review): presumably this is a bitboard knight-attack generator and the
/// NFile* masks clear bits that wrapped across board edges -- confirm against
/// the mask definitions, which are not visible here.
///
/// @param knights Input bit set.
/// @return The OR of the four shifted sets.
inline U64 sKnights(U64 knights) {
    // Horizontal displacement by one and by two positions, both directions merged.
    const U64 step1 = ((knights << 1) & NFileA) | ((knights >> 1) & NFileH);
    const U64 step2 = ((knights << 2) & NFileAB) | ((knights >> 2) & NFileGH);

    // Vertical displacement: 16 bits for the one-step set, 8 bits for the two-step set.
    const U64 shiftedBy16 = (step1 << 16) | (step1 >> 16);
    const U64 shiftedBy8 = (step2 << 8) | (step2 >> 8);

    return shiftedBy16 | shiftedBy8;
}
却不能同样地优化下面这段代码:
/// Single-expression variant: the same shifts, masks and ORs as a version with
/// named temporaries, but every masked sub-term is written out again each time
/// it is needed instead of being stored in a local.
///
/// NOTE(review): presumably a bitboard knight-attack generator; the NFile*
/// masks are defined elsewhere -- confirm their meaning there.
///
/// @param knights Input bit set.
/// @return The OR of the four shifted sets.
inline U64 sLKnights(U64 knights) {
    // Parentheses make explicit that each shift is applied before its mask.
    return ((((knights >> 1) & NFileH) | ((knights << 1) & NFileA)) << 16)
         | ((((knights >> 1) & NFileH) | ((knights << 1) & NFileA)) >> 16)
         | ((((knights >> 2) & NFileGH) | ((knights << 2) & NFileAB)) << 8)
         | ((((knights >> 2) & NFileGH) | ((knights << 2) & NFileAB)) >> 8);
}
使用此代码衡量效果:
// Timing harness: runs the statement under test in 10000 batches of 9999
// iterations each, accumulating the elapsed performance-counter ticks of every
// batch, then prints the checksum and the average batch time.
// st/en: counter values before/after a batch; freq: counter ticks per second;
// dur: running total of elapsed ticks over all batches.
LARGE_INTEGER st, en, freq, dur;
// Printed at the end. NOTE(review): presumably X accumulates into this so the
// calls under test have an observable result -- confirm against the real X.
unsigned long long sum=0;
dur.QuadPart = 0;
QueryPerformanceFrequency(&freq);
for (size_t k = 0; k < 10000; k++)
{
QueryPerformanceCounter(&st);
// Inner loop starts at 1, so each batch executes X 9999 times.
for (unsigned long long i = 1ull; i < 10000; i++)
// Placeholder for the statement being measured (not shown in this snippet).
X;
QueryPerformanceCounter(&en);
dur.QuadPart += en.QuadPart - st.QuadPart;
}
// Average ticks per batch (integer division, truncating) scaled by 1000000 and
// divided by ticks-per-second, i.e. the average batch time in microseconds.
cout << sum << endl << (dur.QuadPart / 10000) * 1000000 / freq.QuadPart << endl;
结果如下(单位:微秒): 方法#1:32 方法#2:43
第二种方法生成的汇编代码比方法#1的更长,使用的编译器是 Microsoft Visual Studio 2014 CTP3