我有一段 C++ 代码,每次处理一个含 1800 个元素的数组中的三个连续值。用 ICC 14.0 编译出的代码比 MSVC 生成的代码慢约 68%(MSVC 约 1600 个 CPU 周期,ICC 约 2700 个)。我不明白原因,有人能帮忙吗?即使给英特尔编译器加上 -O3 开关,耗时也没有变化。CPU 是 Ivy Bridge。
#include <iostream>
// Micro-benchmark: fills a 1200-element array, then 10000 times sums the
// products of adjacent element pairs and prints the elapsed __rdtsc() tick
// count together with the sum.
// NOTE(review): __rdtsc is not declared by <iostream>; presumably a compiler
// intrinsic header (e.g. <intrin.h> or <x86intrin.h>) is expected -- confirm.
int main(){
int data[1200];
// Fill with deterministic dummy values: data[y] = y/2 + 7 (integer division).
for(int y=0; y<1200; y++){
data[y] = y/2 + 7;
}
int counter = 0;
// Repeat the measurement 10000 times; each pass prints one line.
while(counter < 10000){
int Accum = 0;
long long start = 0;
long long end = 0;
int p = 0;
start = __rdtsc();
// Timed region: consume the array two elements per step (600 steps).
while(p < 1200){
unsigned int level1 = data[p];
unsigned int factor = data[p + 1];
// The product is computed in unsigned arithmetic, then added to the int accumulator.
Accum += (level1 * factor);
p = p + 2;
}
end = __rdtsc();
// Printing Accum makes the loop result observable; std::endl also flushes,
// but that happens outside the timed region.
std::cout << (end - start) << " " << Accum << std::endl;
counter++;
}
}
答案 0 :(得分:4)
ICC 在这里表现很差,因为它为每次 data[n] 访问都单独计算地址,例如 mov edi,dword ptr [rsp+rax*4+44h]……这些运行时乘法的开销很大。你应该可以通过改写代码来避免这一点,让索引变成常量(也可以连续使用三次 *p_data++,但这会引入顺序依赖,可能对性能产生负面影响)。
// Walk the array three elements at a time using constant offsets from one
// moving pointer, so every load is a fixed displacement off p_data instead of
// a separately scaled index.
// The pointer type matches the `int data[...]` declaration used in the
// question's listing (an `unsigned*` cannot be initialised from an `int*`);
// the values are converted to unsigned on load, as in the question's code.
// NOTE: the 1800 bound assumes the 1800-element array described in the
// question text; Accum1/Accum2 are accumulators declared by the surrounding code.
for (const int* p_data = &data[0], *p_end = data + 1800; p_data < p_end; p_data += 3)
{
    const unsigned level1 = p_data[0];
    const unsigned level2 = p_data[1];
    const unsigned factor = p_data[2];
    // Both levels share the same factor (third element of the triple).
    Accum1 += level1 * factor;
    Accum2 += level2 * factor;
}
答案 1 :(得分:1)
user997112,我测试了你的新代码(只有一个 level 和一个累加器),在 -O3 选项下 gcc 和 icc 只相差约 5%(-march=native -mtune=native 可能对你有帮助)。我的 Core 2 Q6600 频率固定在 2.4 GHz,gcc 的最佳结果为 1800,icc 的最佳结果为 1900。
这是我的测试版本(rdtsc() 用 GNU 内联汇编重新定义,每次运行的耗时保存在数组中,最后只打印最小(最佳)耗时):
$ cat my.cc
#include <iostream>
#if 1
// my cpu has no rdtscp, so use asm
// Reads the time-stamp counter through GNU inline assembly: executes cpuid
// (with eax = 0) followed by rdtsc, and returns the 64-bit value assembled
// from edx:eax. cpuid is presumably there to serialize execution before the
// counter is read -- confirm against the CPU manual.
inline unsigned long long rdtsc() __attribute__((always_inline));
inline unsigned long long rdtsc() {
unsigned int lo, hi;
asm volatile (
"cpuid \n"
"rdtsc"
: "=a"(lo), "=d"(hi) /* outputs */
: "a"(0) /* inputs */
: "%ebx", "%ecx"); /* clobbers*/
// Combine the low (eax) and high (edx) halves into one 64-bit tick count.
return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
#else
// Alternative path: map rdtsc onto the compiler-provided __rdtsc.
#define rdtsc __rdtsc
#endif
// Benchmark driver: runs the pair-product loop 10000 times, records the
// rdtsc() tick delta and the sum of every run, then reports the smallest
// delta seen.
int main(){
int data[1200];
int dummy[10000];
int stats[10000];
// Fill with deterministic dummy values: data[y] = y/2 + 7 (integer division).
for(int y=0; y<1200; y++){
data[y] = y/2 + 7;
}
// Zero the timing table before measuring.
for(int y=0; y<10000; y++){
stats[y]=0;
}
int counter = 0;
// Repeat the measurement 10000 times; nothing is printed inside this loop.
while(counter < 10000){
int Accum = 0;
long long start = 0;
long long end = 0;
int p = 0;
start = rdtsc();
// Timed region: consume the array two elements per step (600 steps).
while(p < 1200){
unsigned int level1 = data[p];
unsigned int factor = data[p + 1];
Accum += (level1 * factor);
p = p + 2;
}
end = rdtsc();
// The long long tick delta is narrowed to int when stored.
stats[counter]=(end - start);
// Keep the sum so it can be printed later; presumably this also stops the
// compiler from discarding the timed loop -- confirm.
dummy[counter]=Accum;
counter++;
}
// Running minimum starts at 0xfffff; runs at or above that value are never reported.
int min=0xfffff;
for(int y=0; y<10000; y++) {
if(stats[y] < min) {
min = stats[y];
// Each time a new minimum is found, print it along with that run's sum.
std::cout << min << std::endl;
std::cout << "accum " << dummy[y] << std::endl;
}
}
// Final line of output: the overall minimum.
std::cout << min << std::endl;
}
使用icc 14和gcc 4.8编译为:
$ g++ my.cc -o mygccO3t -O3 -march=native -mtune=native
$ icc my.cc -o myiccO3t -O3 -march=native -mtune=native
结果(已禁用 CPU 频率调节,频率固定在 2.4 GHz;用 taskset 将进程绑定到单个核心,并用 Linux 的 PMU 访问工具 perf 测量):
$ taskset -c 3 perf stat -e cycles:u,instructions:u ./myiccO3t |tail -n 1
Performance counter stats for './myiccO3t':
23 875 260 cycles:u
28 866 440 instructions:u # 1,21 insns per cycle
0,011297567 seconds time elapsed
1899
$ taskset -c 3 perf stat -e cycles:u,instructions:u ./mygccO3t |tail -n 1
Performance counter stats for './mygccO3t':
22 389 238 cycles:u
43 551 129 instructions:u # 1,95 insns per cycle
0,010683920 seconds time elapsed
1800
因此可以看到,gcc 处理相同数量的数据需要更多的指令,但它的 IPC(每时钟周期指令数)也更高。
gcc 生成的内部循环汇编代码很简单:
4009b9: 45 31 c0 xor %r8d,%r8d
4009bc: 45 31 c9 xor %r9d,%r9d
4009bf: 90 nop
4009c0: 44 89 c8 mov %r9d,%eax
4009c3: 0f a2 cpuid
4009c5: 0f 31 rdtsc
4009c7: 49 89 d2 mov %rdx,%r10
4009ca: 89 c0 mov %eax,%eax
4009cc: 48 89 e2 mov %rsp,%rdx
4009cf: 49 c1 e2 20 shl $0x20,%r10
4009d3: 31 ff xor %edi,%edi
4009d5: 49 09 c2 or %rax,%r10
4009d8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
4009df: 00
vvvv
4009e0: 8b 4a 04 mov 0x4(%rdx),%ecx
4009e3: 48 83 c2 08 add $0x8,%rdx
4009e7: 0f af 4a f8 imul -0x8(%rdx),%ecx
4009eb: 48 39 d5 cmp %rdx,%rbp
4009ee: 8d 34 39 lea (%rcx,%rdi,1),%esi
4009f1: 89 f7 mov %esi,%edi
4009f3: 75 eb jne 4009e0 <main+0x90>
^^^^
4009f5: 44 89 c8 mov %r9d,%eax
4009f8: 0f a2 cpuid
4009fa: 0f 31 rdtsc
icc 则生成了大量 SSE2 指令并做了循环展开(循环的一部分,即 1184 次迭代,被向量化,剩余的尾部在循环之后单独处理):
400e4c: 33 c9 xor %ecx,%ecx
400e4e: 49 89 cd mov %rcx,%r13
400e51: 33 c0 xor %eax,%eax
400e53: 0f a2 cpuid
400e55: 0f 31 rdtsc
400e57: 66 0f ef c9 pxor %xmm1,%xmm1
400e5b: 66 0f 6f 05 7d 2f 00 movdqa 0x2f7d(%rip),%xmm0
400e62: 00
400e63: 41 89 c4 mov %eax,%r12d
400e66: 33 c0 xor %eax,%eax
vvvv
400e68: 66 0f 6f 9c c4 80 38 movdqa 0x13880(%rsp,%rax,8),%xmm3
400e6f: 01 00
400e71: 66 0f 6f 94 c4 90 38 movdqa 0x13890(%rsp,%rax,8),%xmm2
400e78: 01 00
400e7a: 66 0f 6f f3 movdqa %xmm3,%xmm6
400e7e: 66 0f 62 f2 punpckldq %xmm2,%xmm6
400e82: 66 0f 6a da punpckhdq %xmm2,%xmm3
400e86: 66 0f 6f fe movdqa %xmm6,%xmm7
400e8a: 66 0f 62 fb punpckldq %xmm3,%xmm7
400e8e: 66 0f 6f ac c4 a0 38 movdqa 0x138a0(%rsp,%rax,8),%xmm5
400e95: 01 00
400e97: 66 44 0f 6f d7 movdqa %xmm7,%xmm10
400e9c: 66 0f 6a f3 punpckhdq %xmm3,%xmm6
400ea0: 66 44 0f 6f c5 movdqa %xmm5,%xmm8
400ea5: 66 0f 6f a4 c4 b0 38 movdqa 0x138b0(%rsp,%rax,8),%xmm4
400eac: 01 00
400eae: 66 0f 73 d7 20 psrlq $0x20,%xmm7
400eb3: 66 44 0f f4 d6 pmuludq %xmm6,%xmm10
400eb8: 66 0f 73 d6 20 psrlq $0x20,%xmm6
400ebd: 66 0f f4 fe pmuludq %xmm6,%xmm7
400ec1: 66 44 0f 6f ac c4 c0 movdqa 0x138c0(%rsp,%rax,8),%xmm13
400ec8: 38 01 00
400ecb: 66 44 0f db d0 pand %xmm0,%xmm10
400ed0: 66 44 0f 62 c4 punpckldq %xmm4,%xmm8
400ed5: 66 45 0f 6f f5 movdqa %xmm13,%xmm14
400eda: 66 44 0f 6f a4 c4 d0 movdqa 0x138d0(%rsp,%rax,8),%xmm12
400ee1: 38 01 00
400ee4: 66 45 0f 6f c8 movdqa %xmm8,%xmm9
400ee9: 66 0f 6a ec punpckhdq %xmm4,%xmm5
400eed: 66 0f 73 f7 20 psllq $0x20,%xmm7
400ef2: 66 0f 6f a4 c4 e0 38 movdqa 0x138e0(%rsp,%rax,8),%xmm4
400ef9: 01 00
400efb: 66 44 0f eb d7 por %xmm7,%xmm10
400f00: 66 0f 6f 9c c4 f0 38 movdqa 0x138f0(%rsp,%rax,8),%xmm3
400f07: 01 00
400f09: 66 41 0f fe ca paddd %xmm10,%xmm1
400f0e: 66 44 0f 62 cd punpckldq %xmm5,%xmm9
400f13: 48 83 c0 10 add $0x10,%rax
400f17: 66 44 0f 6a c5 punpckhdq %xmm5,%xmm8
400f1c: 66 0f 6f ec movdqa %xmm4,%xmm5
400f20: 66 45 0f 62 f4 punpckldq %xmm12,%xmm14
400f25: 66 45 0f 6f d9 movdqa %xmm9,%xmm11
400f2a: 66 45 0f 6a ec punpckhdq %xmm12,%xmm13
400f2f: 66 45 0f 6f fe movdqa %xmm14,%xmm15
400f34: 66 0f 62 eb punpckldq %xmm3,%xmm5
400f38: 66 41 0f 73 d1 20 psrlq $0x20,%xmm9
400f3e: 66 45 0f 62 fd punpckldq %xmm13,%xmm15
400f43: 66 0f 6f f5 movdqa %xmm5,%xmm6
400f47: 66 0f 6a e3 punpckhdq %xmm3,%xmm4
400f4b: 66 41 0f 6f d7 movdqa %xmm15,%xmm2
400f50: 66 45 0f f4 d8 pmuludq %xmm8,%xmm11
400f55: 66 41 0f 73 d0 20 psrlq $0x20,%xmm8
400f5b: 66 45 0f f4 c8 pmuludq %xmm8,%xmm9
400f60: 66 45 0f 6a f5 punpckhdq %xmm13,%xmm14
400f65: 66 41 0f 73 d7 20 psrlq $0x20,%xmm15
400f6b: 66 0f 62 f4 punpckldq %xmm4,%xmm6
400f6f: 66 44 0f db d8 pand %xmm0,%xmm11
400f74: 66 41 0f f4 d6 pmuludq %xmm14,%xmm2
400f79: 66 41 0f 73 d6 20 psrlq $0x20,%xmm14
400f7f: 66 45 0f f4 fe pmuludq %xmm14,%xmm15
400f84: 66 0f 6a ec punpckhdq %xmm4,%xmm5
400f88: 66 0f 6f fe movdqa %xmm6,%xmm7
400f8c: 66 0f f4 fd pmuludq %xmm5,%xmm7
400f90: 66 0f 73 d6 20 psrlq $0x20,%xmm6
400f95: 66 0f 73 d5 20 psrlq $0x20,%xmm5
400f9a: 66 41 0f 73 f1 20 psllq $0x20,%xmm9
400fa0: 66 0f f4 f5 pmuludq %xmm5,%xmm6
400fa4: 66 45 0f eb d9 por %xmm9,%xmm11
400fa9: 66 0f db d0 pand %xmm0,%xmm2
400fad: 66 41 0f 73 f7 20 psllq $0x20,%xmm15
400fb3: 66 41 0f fe cb paddd %xmm11,%xmm1
400fb8: 66 41 0f eb d7 por %xmm15,%xmm2
400fbd: 66 0f db f8 pand %xmm0,%xmm7
400fc1: 66 0f 73 f6 20 psllq $0x20,%xmm6
400fc6: 66 0f fe ca paddd %xmm2,%xmm1
400fca: 66 0f eb fe por %xmm6,%xmm7
400fce: 66 0f fe cf paddd %xmm7,%xmm1
400fd2: 48 3d 50 02 00 00 cmp $0x250,%rax
400fd8: 0f 82 8a fe ff ff jb 400e68 <main+0xe8>
^^^^
400fde: 66 0f 6f c1 movdqa %xmm1,%xmm0
400fe2: 66 0f 73 d8 08 psrldq $0x8,%xmm0
400fe7: 66 0f fe c8 paddd %xmm0,%xmm1
400feb: 66 0f 6f d1 movdqa %xmm1,%xmm2
400fef: 8b 84 24 00 4b 01 00 mov 0x14b00(%rsp),%eax
400ff6: 66 0f 73 d2 20 psrlq $0x20,%xmm2
400ffb: 0f af 84 24 04 4b 01 imul 0x14b04(%rsp),%eax
401002: 00
401003: 66 0f fe ca paddd %xmm2,%xmm1
401007: 66 0f 7e cb movd %xmm1,%ebx
40100b: 8b 94 24 08 4b 01 00 mov 0x14b08(%rsp),%edx
401012: 03 d8 add %eax,%ebx
401014: 0f af 94 24 0c 4b 01 imul 0x14b0c(%rsp),%edx
40101b: 00
40101c: 8b b4 24 10 4b 01 00 mov 0x14b10(%rsp),%esi
401023: 03 da add %edx,%ebx
401025: 0f af b4 24 14 4b 01 imul 0x14b14(%rsp),%esi
40102c: 00
40102d: 8b bc 24 18 4b 01 00 mov 0x14b18(%rsp),%edi
401034: 03 de add %esi,%ebx
401036: 0f af bc 24 1c 4b 01 imul 0x14b1c(%rsp),%edi
40103d: 00
40103e: 44 8b 84 24 20 4b 01 mov 0x14b20(%rsp),%r8d
401045: 00
401046: 03 df add %edi,%ebx
401048: 44 0f af 84 24 24 4b imul 0x14b24(%rsp),%r8d
40104f: 01 00
401051: 44 8b 8c 24 28 4b 01 mov 0x14b28(%rsp),%r9d
401058: 00
401059: 41 03 d8 add %r8d,%ebx
40105c: 44 0f af 8c 24 2c 4b imul 0x14b2c(%rsp),%r9d
401063: 01 00
401065: 44 8b 94 24 30 4b 01 mov 0x14b30(%rsp),%r10d
40106c: 00
40106d: 41 03 d9 add %r9d,%ebx
401070: 44 0f af 94 24 34 4b imul 0x14b34(%rsp),%r10d
401077: 01 00
401079: 44 8b 9c 24 38 4b 01 mov 0x14b38(%rsp),%r11d
401080: 00
401081: 41 03 da add %r10d,%ebx
401084: 44 0f af 9c 24 3c 4b imul 0x14b3c(%rsp),%r11d
40108b: 01 00
40108d: 41 03 db add %r11d,%ebx
401090: e8 eb 00 00 00 callq 401180 <_Z5rdtscv>