了解Intel CPU的4K别名

时间:2019-01-29 06:33:57

标签: performance x86 intel cpu-architecture

我一直在阅读有关Intel CPU上4K别名的资料:由于地址位6至11存在歧义,加载和存储会被判定为重叠。因此,我试图编写各种简单的测试(在i7-3770k,Win7、64位,VS2017上)专门引起该问题,以确保我在实践中理解它。

我一直在尝试但未能证明其行为的第一个测试是:

// Micro-benchmark intended to provoke 4K aliasing between two arrays.
// For every offset in [0, OffsetCount) it runs TestCount passes of
//     a[i] += b[(offset + i) % ValueCount] * 3.142f
// over all ValueCount elements, then prints the measured duration divided by
// TestCount together with the offset. Takes no parameters and returns nothing.
// StartCPUCycles / EndCPUCycles are defined elsewhere; presumably they read a
// cycle counter (the disassembly shows cpuid/rdtsc at this point) - TODO confirm.
void Test4KAliasing1()
{
    typedef float Value;// Also tried with double


    const uint32_t ValueCount = 1024;   // elements per array
    const uint32_t OffsetCount = 256;   // number of distinct offsets that are timed
    const uint32_t TestCount = 512;     // passes per offset; the printed time is divided by this


    // Both arrays are requested with 4096-byte alignment, so a[i] and b[i] start
    // at the same offset from a 4096-byte boundary.
    // NOTE(review): the results of _aligned_malloc are not checked for NULL before use.
    Value* a = (Value*)_aligned_malloc(ValueCount * sizeof(Value), 4096);
    Value* b = (Value*)_aligned_malloc(ValueCount * sizeof(Value), 4096);


    // Fill both arrays with the same rand() values.
    for (uint32_t i = 0; i < ValueCount; ++i)
        a[i] = b[i] = (Value)rand();

    for (uint32_t offset = 0; offset < OffsetCount; ++offset)
    {
        uint64_t startTime = StartCPUCycles();


        for (uint32_t test = 0; test < TestCount; ++test)
        {
            for (uint32_t i = 0; i < ValueCount; ++i)
            {
                // Index into b is shifted by offset and wraps around at ValueCount.
                uint32_t j = (offset + i) % ValueCount;


                // Per element: load b[j], load a[i], store a[i].
                a[i] += b[j] * 3.142f;
            }
        }


        uint64_t duration = EndCPUCycles() - startTime;


        // Reports the average per pass (duration / TestCount) for this offset.
        printf("time: %llu\toffset: %u ", duration / TestCount, offset);
        // The format string has no conversion specifiers, so a and b are not printed.
        // NOTE(review): presumably they are passed only to keep the arrays observable
        // to the optimizer - confirm.
        printf("\n", a, b);
    }


    // Release in reverse order of allocation.
    _aligned_free(b);
    _aligned_free(a);
}

灵感来自于http://richardstartin.uk/the-much-aligned-garbage-collector/

所以我不太确定为什么从最终的计时结果来看并没有显示出问题?我原以为,由于乱序执行,跨循环迭代时会有尚未完成的存储,同时又有对地址存在歧义的位置的加载?

生成的程序集为:

000000013F2510E4  cpuid  
000000013F2510E6  rdtsc  
000000013F2510E8  shl         rdx,20h  
000000013F2510EC  mov         r9d,200h  
000000013F2510F2  or          rax,rdx  
000000013F2510F5  mov         r10,rax  
000000013F2510F8  nop         dword ptr [rax+rax]  
000000013F251100  lea         ebx,[rsi+1]  
000000013F251103  mov         r8d,80h  
000000013F251109  lea         rdx,[r14+8]  
000000013F25110D  nop         dword ptr [rax]  
000000013F251110  mov         rax,rbx  
000000013F251113  lea         ecx,[rbx-1]  
000000013F251116  and         eax,3FFh  
000000013F25111B  lea         rdx,[rdx+20h]  
000000013F25111F  and         ecx,3FFh  
000000013F251125  vmulss      xmm1,xmm6,dword ptr [rdi+rcx*4]  
000000013F25112A  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F25112F  vmovss      dword ptr [rdx-28h],xmm2  
000000013F251134  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251139  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F25113E  vmovss      dword ptr [rdx-24h],xmm2  
000000013F251143  lea         eax,[rbx+1]  
000000013F251146  and         eax,3FFh  
000000013F25114B  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251150  vaddss      xmm2,xmm1,dword ptr [rdx-20h]  
000000013F251155  vmovss      dword ptr [rdx-20h],xmm2  
000000013F25115A  lea         eax,[rbx+2]  
000000013F25115D  and         eax,3FFh  
000000013F251162  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251167  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]  
000000013F25116C  vmovss      dword ptr [rdx-1Ch],xmm2  
000000013F251171  lea         eax,[rbx+3]  
000000013F251174  and         eax,3FFh  
000000013F251179  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F25117E  vaddss      xmm2,xmm1,dword ptr [rdx-18h]  
000000013F251183  vmovss      dword ptr [rdx-18h],xmm2  
000000013F251188  lea         eax,[rbx+4]  
000000013F25118B  and         eax,3FFh  
000000013F251190  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251195  vaddss      xmm2,xmm1,dword ptr [rdx-14h]  
000000013F25119A  vmovss      dword ptr [rdx-14h],xmm2  
000000013F25119F  lea         eax,[rbx+5]  
000000013F2511A2  and         eax,3FFh  
000000013F2511A7  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F2511AC  vaddss      xmm2,xmm1,dword ptr [rdx-10h]  
000000013F2511B1  lea         eax,[rbx+6]  
000000013F2511B4  add         ebx,8  
000000013F2511B7  vmovss      dword ptr [rdx-10h],xmm2  
000000013F2511BC  and         eax,3FFh  
000000013F2511C1  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F2511C6  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]  
000000013F2511CB  vmovss      dword ptr [rdx-0Ch],xmm2  
000000013F2511D0  sub         r8,1  
000000013F2511D4  jne         Test4KAliasing1+0B0h (013F251110h)  
000000013F2511DA  sub         r9,1  
000000013F2511DE  jne         Test4KAliasing1+0A0h (013F251100h)  
000000013F2511E4  rdtsc  

我还在网上看到各种描述,说最低的12位必须匹配才能发生这种混叠,而在其他地方,只有6到11位?由于最低的6位是高速缓存行中的字节索引,而且所有内容都是基于高速缓存行的,所以我想它只需要6至11位就可以匹配?

编辑:

另外,根据彼得的回答,我也尝试过:

a[i] *= 1.234f;
b[j] += 4.321f;

似乎没有显示出问题并生成:

000000013F6C10E8  cpuid  
000000013F6C10EA  rdtsc  
000000013F6C10EC  shl         rdx,20h  
000000013F6C10F0  mov         ebx,200h  
000000013F6C10F5  or          rax,rdx  
000000013F6C10F8  mov         r9,rax  
000000013F6C10FB  nop         dword ptr [rax+rax]  
000000013F6C1100  lea         edx,[rsi+1]  
000000013F6C1103  mov         r8d,80h  
000000013F6C1109  lea         rcx,[r14+8]  
000000013F6C110D  nop         dword ptr [rax]  
000000013F6C1110  vmulss      xmm1,xmm6,dword ptr [rcx-8]  
000000013F6C1115  vmovss      dword ptr [rcx-8],xmm1  
000000013F6C111A  lea         eax,[rdx-1]  
000000013F6C111D  and         eax,3FFh  
000000013F6C1122  lea         rcx,[rcx+20h]  
000000013F6C1126  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C112B  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C1130  vmulss      xmm1,xmm6,dword ptr [rcx-24h]  
000000013F6C1135  vmovss      dword ptr [rcx-24h],xmm1  
000000013F6C113A  mov         rax,rdx  
000000013F6C113D  and         eax,3FFh  
000000013F6C1142  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C1147  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C114C  vmulss      xmm0,xmm6,dword ptr [rcx-20h]  
000000013F6C1151  lea         eax,[rdx+1]  
000000013F6C1154  and         eax,3FFh  
000000013F6C1159  vmovss      dword ptr [rcx-20h],xmm0  
000000013F6C115E  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C1163  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C1168  vmulss      xmm1,xmm6,dword ptr [rcx-1Ch]  
000000013F6C116D  vmovss      dword ptr [rcx-1Ch],xmm1  
000000013F6C1172  lea         eax,[rdx+2]  
000000013F6C1175  and         eax,3FFh  
000000013F6C117A  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C117F  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C1184  vmulss      xmm1,xmm6,dword ptr [rcx-18h]  
000000013F6C1189  vmovss      dword ptr [rcx-18h],xmm1  
000000013F6C118E  lea         eax,[rdx+3]  
000000013F6C1191  and         eax,3FFh  
000000013F6C1196  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C119B  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11A0  vmulss      xmm1,xmm6,dword ptr [rcx-14h]  
000000013F6C11A5  vmovss      dword ptr [rcx-14h],xmm1  
000000013F6C11AA  lea         eax,[rdx+4]  
000000013F6C11AD  and         eax,3FFh  
000000013F6C11B2  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C11B7  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11BC  vmulss      xmm1,xmm6,dword ptr [rcx-10h]  
000000013F6C11C1  lea         eax,[rdx+5]  
000000013F6C11C4  and         eax,3FFh  
000000013F6C11C9  vmovss      dword ptr [rcx-10h],xmm1  
000000013F6C11CE  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C11D3  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11D8  vmulss      xmm1,xmm6,dword ptr [rcx-0Ch]  
000000013F6C11DD  lea         eax,[rdx+6]  
000000013F6C11E0  add         edx,8  
000000013F6C11E3  and         eax,3FFh  
000000013F6C11E8  vmovss      dword ptr [rcx-0Ch],xmm1  
000000013F6C11ED  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C11F2  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11F7  sub         r8,1  
000000013F6C11FB  jne         Test4KAliasing1+0B0h (013F6C1110h)  
000000013F6C1201  sub         rbx,1  
000000013F6C1205  jne         Test4KAliasing1+0A0h (013F6C1100h)  
000000013F6C120B  rdtsc 

还基于彼得提到的链接问题,我尝试了3个数组:

a[i] += b[j] + c[j];

似乎也没有问题。生成的代码是:

000000013F5110F6  cpuid  
000000013F5110F8  rdtsc  
000000013F5110FA  shl         rdx,20h  
000000013F5110FE  mov         r8d,200h  
000000013F511104  or          rax,rdx  
000000013F511107  mov         r10,rax  
000000013F51110A  nop         word ptr [rax+rax]  
000000013F511110  lea         ebx,[rbp+1]  
000000013F511113  mov         r9d,100h  
000000013F511119  lea         rdx,[r13+8]  
000000013F51111D  nop         dword ptr [rax]  
000000013F511120  mov         rax,rbx  
000000013F511123  lea         ecx,[rbx-1]  
000000013F511126  and         eax,7FFh  
000000013F51112B  lea         rdx,[rdx+20h]  
000000013F51112F  and         ecx,7FFh  
000000013F511135  vmovss      xmm0,dword ptr [rsi+rcx*4]  
000000013F51113A  vaddss      xmm1,xmm0,dword ptr [rdi+rcx*4]  
000000013F51113F  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F511144  vmovss      dword ptr [rdx-28h],xmm2  
000000013F511149  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F51114E  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F511153  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F511158  vmovss      dword ptr [rdx-24h],xmm2  
000000013F51115D  lea         eax,[rbx+1]  
000000013F511160  and         eax,7FFh  
000000013F511165  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F51116A  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F51116F  vaddss      xmm2,xmm1,dword ptr [rdx-20h]  
000000013F511174  vmovss      dword ptr [rdx-20h],xmm2  
000000013F511179  lea         eax,[rbx+2]  
000000013F51117C  and         eax,7FFh  
000000013F511181  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F511186  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F51118B  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]  
000000013F511190  vmovss      dword ptr [rdx-1Ch],xmm2  
000000013F511195  lea         eax,[rbx+3]  
000000013F511198  and         eax,7FFh  
000000013F51119D  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111A2  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111A7  vaddss      xmm2,xmm1,dword ptr [rdx-18h]  
000000013F5111AC  vmovss      dword ptr [rdx-18h],xmm2  
000000013F5111B1  lea         eax,[rbx+4]  
000000013F5111B4  and         eax,7FFh  
000000013F5111B9  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111BE  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111C3  vaddss      xmm2,xmm1,dword ptr [rdx-14h]  
000000013F5111C8  vmovss      dword ptr [rdx-14h],xmm2  
000000013F5111CD  lea         eax,[rbx+5]  
000000013F5111D0  and         eax,7FFh  
000000013F5111D5  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111DA  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111DF  vaddss      xmm2,xmm1,dword ptr [rdx-10h]  
000000013F5111E4  lea         eax,[rbx+6]  
000000013F5111E7  add         ebx,8  
000000013F5111EA  vmovss      dword ptr [rdx-10h],xmm2  
000000013F5111EF  and         eax,7FFh  
000000013F5111F4  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111F9  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111FE  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]  
000000013F511203  vmovss      dword ptr [rdx-0Ch],xmm2  
000000013F511208  sub         r9,1  
000000013F51120C  jne         Test4KAliasing2+0C0h (013F511120h)  
000000013F511212  sub         r8,1  
000000013F511216  jne         Test4KAliasing2+0B0h (013F511110h)  
000000013F51121C  rdtsc  

根据彼得对其回答所做的评论/更新,我尝试过:

a[i] *= 1.234f;
b[i] += 4.321f;

没有显示问题。注意:在之前的大多数尝试中,我都尝试使用i = j + i + offset来改变i的偏移量,希望能找到可以缓解该问题的偏移量。(由于我的x86知识有些生疏,我仍在研读这里的反汇编以了解地址是如何生成的。)

000000013F7D1104  cpuid  
000000013F7D1106  rdtsc  
000000013F7D1108  shl         rdx,20h  
000000013F7D110C  or          rax,rdx  
000000013F7D110F  mov         edx,200h  
000000013F7D1114  mov         rbx,rax  
000000013F7D1117  cmp         rsi,r15  
000000013F7D111A  ja          Test4KAliasing1+130h (013F7D1190h)  
000000013F7D111C  cmp         rbp,r14  
000000013F7D111F  jb          Test4KAliasing1+130h (013F7D1190h)  
000000013F7D1121  lea         rcx,[rsi+4]  
000000013F7D1125  mov         eax,100h  
000000013F7D112A  nop         word ptr [rax+rax]  
000000013F7D1130  vmulss      xmm1,xmm6,dword ptr [rdi+rcx-4]  
000000013F7D1136  vmovss      dword ptr [rdi+rcx-4],xmm1  
000000013F7D113C  vaddss      xmm1,xmm7,dword ptr [rcx-4]  
000000013F7D1141  vmovss      dword ptr [rcx-4],xmm1  
000000013F7D1146  vmulss      xmm1,xmm6,dword ptr [rcx+rdi]  
000000013F7D114B  vmovss      dword ptr [rcx+rdi],xmm1  
000000013F7D1150  vaddss      xmm0,xmm7,dword ptr [rcx]  
000000013F7D1154  vmovss      dword ptr [rcx],xmm0  
000000013F7D1158  vmulss      xmm0,xmm6,dword ptr [rdi+rcx+4]  
000000013F7D115E  vmovss      dword ptr [rdi+rcx+4],xmm0  
000000013F7D1164  vaddss      xmm0,xmm7,dword ptr [rcx+4]  
000000013F7D1169  vmovss      dword ptr [rcx+4],xmm0  
000000013F7D116E  vmulss      xmm0,xmm6,dword ptr [rdi+rcx+8]  
000000013F7D1174  vmovss      dword ptr [rdi+rcx+8],xmm0  
000000013F7D117A  vaddss      xmm0,xmm7,dword ptr [rcx+8]  
000000013F7D117F  vmovss      dword ptr [rcx+8],xmm0  
000000013F7D1184  lea         rcx,[rcx+10h]  
000000013F7D1188  sub         rax,1  
000000013F7D118C  jne         Test4KAliasing1+0D0h (013F7D1130h)  
000000013F7D118E  jmp         Test4KAliasing1+1AEh (013F7D120Eh)  
000000013F7D1190  vmovups     xmm2,xmmword ptr [__xmm@3f9df3b63f9df3b63f9df3b63f9df3b6 (013F7EA0E0h)]  
000000013F7D1198  vmovups     xmm3,xmmword ptr [__xmm@408a45a2408a45a2408a45a2408a45a2 (013F7EA0F0h)]  
000000013F7D11A0  lea         rax,[rsi+10h]  
000000013F7D11A4  mov         ecx,40h  
000000013F7D11A9  nop         dword ptr [rax]  
000000013F7D11B0  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax-10h]  
000000013F7D11B6  vmovups     xmmword ptr [rdi+rax-10h],xmm1  
000000013F7D11BC  vaddps      xmm1,xmm3,xmmword ptr [rax-10h]  
000000013F7D11C1  vmovups     xmmword ptr [rax-10h],xmm1  
000000013F7D11C6  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax]  
000000013F7D11CB  vmovups     xmmword ptr [rdi+rax],xmm1  
000000013F7D11D0  vaddps      xmm1,xmm3,xmmword ptr [rax]  
000000013F7D11D4  vmovups     xmmword ptr [rax],xmm1  
000000013F7D11D8  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax+10h]  
000000013F7D11DE  vmovups     xmmword ptr [rdi+rax+10h],xmm1  
000000013F7D11E4  vaddps      xmm1,xmm3,xmmword ptr [rax+10h]  
000000013F7D11E9  vmovups     xmmword ptr [rax+10h],xmm1  
000000013F7D11EE  vmulps      xmm1,xmm2,xmmword ptr [rdi+rax+20h]  
000000013F7D11F4  vmovups     xmmword ptr [rdi+rax+20h],xmm1  
000000013F7D11FA  vaddps      xmm1,xmm3,xmmword ptr [rax+20h]  
000000013F7D11FF  vmovups     xmmword ptr [rax+20h],xmm1  
000000013F7D1204  lea         rax,[rax+40h]  
000000013F7D1208  sub         rcx,1  
000000013F7D120C  jne         Test4KAliasing1+150h (013F7D11B0h)  
000000013F7D120E  sub         rdx,1  
000000013F7D1212  jne         Test4KAliasing1+0B7h (013F7D1117h)  
000000013F7D1218  rdtsc  

典型的计时运行时间:

a[i] *= 1.234f;
b[i] += 4.321f;

是:

time: 715       offset: 0
time: 647       offset: 1
time: 641       offset: 2
time: 703       offset: 3
time: 658       offset: 4
time: 657       offset: 5
time: 656       offset: 6
time: 657       offset: 7
time: 658       offset: 8
time: 657       offset: 9
time: 658       offset: 10
time: 653       offset: 11
time: 658       offset: 12
time: 652       offset: 13
time: 658       offset: 14
time: 657       offset: 15
time: 658       offset: 16
time: 656       offset: 17
time: 659       offset: 18
time: 656       offset: 19
time: 656       offset: 20
time: 656       offset: 21
time: 663       offset: 22
time: 657       offset: 23
time: 657       offset: 24
time: 704       offset: 25
time: 714       offset: 26
time: 657       offset: 27
time: 658       offset: 28
time: 658       offset: 29
time: 656       offset: 30
time: 656       offset: 31
time: 657       offset: 32
time: 658       offset: 33
time: 658       offset: 34
time: 656       offset: 35
time: 658       offset: 36
time: 658       offset: 37
time: 658       offset: 38
time: 658       offset: 39
time: 660       offset: 40
time: 660       offset: 41
time: 664       offset: 42
time: 656       offset: 43
time: 656       offset: 44
time: 658       offset: 45
time: 656       offset: 46
time: 656       offset: 47
time: 713       offset: 48
time: 658       offset: 49
time: 663       offset: 50
time: 662       offset: 51
time: 665       offset: 52
time: 663       offset: 53
time: 665       offset: 54
time: 658       offset: 55
time: 658       offset: 56
time: 658       offset: 57
time: 656       offset: 58
time: 657       offset: 59
time: 658       offset: 60
time: 658       offset: 61
time: 656       offset: 62
time: 666       offset: 63
time: 656       offset: 64
time: 658       offset: 65
time: 656       offset: 66
time: 657       offset: 67
time: 658       offset: 68
time: 658       offset: 69
time: 652       offset: 70
time: 658       offset: 71
time: 657       offset: 72
time: 658       offset: 73
time: 658       offset: 74
time: 656       offset: 75
time: 658       offset: 76
time: 665       offset: 77
time: 657       offset: 78
time: 656       offset: 79
time: 656       offset: 80
time: 666       offset: 81
time: 656       offset: 82
time: 702       offset: 83
time: 640       offset: 84
time: 640       offset: 85
time: 657       offset: 86
time: 657       offset: 87
time: 658       offset: 88
time: 658       offset: 89
time: 656       offset: 90
time: 657       offset: 91
time: 657       offset: 92
time: 657       offset: 93
time: 658       offset: 94
time: 662       offset: 95
time: 658       offset: 96
time: 656       offset: 97
time: 657       offset: 98
time: 663       offset: 99
time: 660       offset: 100
time: 663       offset: 101
time: 657       offset: 102
time: 656       offset: 103
time: 664       offset: 104
time: 659       offset: 105
time: 659       offset: 106
time: 658       offset: 107
time: 774       offset: 108
time: 707       offset: 109
time: 710       offset: 110
time: 658       offset: 111
time: 657       offset: 112
time: 661       offset: 113
time: 658       offset: 114
time: 656       offset: 115
time: 658       offset: 116
time: 657       offset: 117
time: 658       offset: 118
time: 660       offset: 119
time: 666       offset: 120
time: 657       offset: 121
time: 658       offset: 122
time: 651       offset: 123
time: 658       offset: 124
time: 657       offset: 125
time: 657       offset: 126
time: 658       offset: 127
time: 656       offset: 128
time: 658       offset: 129
time: 656       offset: 130
time: 658       offset: 131
time: 645       offset: 132
time: 640       offset: 133
time: 640       offset: 134
time: 659       offset: 135
time: 664       offset: 136
time: 658       offset: 137
time: 662       offset: 138
time: 656       offset: 139
time: 658       offset: 140
time: 656       offset: 141
time: 658       offset: 142
time: 660       offset: 143
time: 658       offset: 144
time: 658       offset: 145
time: 656       offset: 146
time: 657       offset: 147
time: 664       offset: 148
time: 656       offset: 149
time: 656       offset: 150
time: 658       offset: 151
time: 656       offset: 152
time: 668       offset: 153
time: 656       offset: 154
time: 656       offset: 155
time: 656       offset: 156
time: 658       offset: 157
time: 656       offset: 158
time: 658       offset: 159
time: 660       offset: 160
time: 658       offset: 161
time: 658       offset: 162
time: 658       offset: 163
time: 658       offset: 164
time: 656       offset: 165
time: 686       offset: 166
time: 656       offset: 167
time: 656       offset: 168
time: 658       offset: 169
time: 656       offset: 170
time: 658       offset: 171
time: 656       offset: 172
time: 656       offset: 173
time: 656       offset: 174
time: 658       offset: 175
time: 656       offset: 176
time: 658       offset: 177
time: 658       offset: 178
time: 654       offset: 179
time: 639       offset: 180
time: 639       offset: 181
time: 639       offset: 182
time: 657       offset: 183
time: 641       offset: 184
time: 640       offset: 185
time: 640       offset: 186
time: 640       offset: 187
time: 640       offset: 188
time: 640       offset: 189
time: 640       offset: 190
time: 700       offset: 191
time: 715       offset: 192
time: 657       offset: 193
time: 657       offset: 194
time: 662       offset: 195
time: 703       offset: 196
time: 640       offset: 197
time: 639       offset: 198
time: 638       offset: 199
time: 640       offset: 200
time: 640       offset: 201
time: 640       offset: 202
time: 704       offset: 203
time: 638       offset: 204
time: 640       offset: 205
time: 639       offset: 206
time: 657       offset: 207
time: 658       offset: 208
time: 657       offset: 209
time: 659       offset: 210
time: 663       offset: 211
time: 658       offset: 212
time: 658       offset: 213
time: 657       offset: 214
time: 667       offset: 215
time: 657       offset: 216
time: 657       offset: 217
time: 658       offset: 218
time: 657       offset: 219
time: 656       offset: 220
time: 661       offset: 221
time: 651       offset: 222
time: 658       offset: 223
time: 658       offset: 224
time: 656       offset: 225
time: 658       offset: 226
time: 658       offset: 227
time: 672       offset: 228
time: 658       offset: 229
time: 656       offset: 230
time: 649       offset: 231
time: 665       offset: 232
time: 657       offset: 233
time: 652       offset: 234
time: 664       offset: 235
time: 656       offset: 236
time: 662       offset: 237
time: 658       offset: 238
time: 665       offset: 239
time: 658       offset: 240
time: 657       offset: 241
time: 656       offset: 242
time: 658       offset: 243
time: 657       offset: 244
time: 658       offset: 245
time: 658       offset: 246
time: 656       offset: 247
time: 658       offset: 248
time: 656       offset: 249
time: 658       offset: 250
time: 656       offset: 251
time: 665       offset: 252
time: 658       offset: 253
time: 656       offset: 254
time: 658       offset: 255

但是:我认为我犯了一个错误,现在已经发现了:

a[i] *= 1.234f;
b[j] += 4.321f;

现在是典型的计时运行:

time: 2794      offset: 0
time: 2737      offset: 1
time: 2655      offset: 2
time: 2748      offset: 3
time: 2605      offset: 4
time: 2730      offset: 5
time: 2665      offset: 6
time: 2703      offset: 7
time: 2571      offset: 8
time: 2558      offset: 9
time: 2213      offset: 10
time: 2200      offset: 11
time: 2325      offset: 12
time: 2200      offset: 13
time: 2200      offset: 14
time: 2264      offset: 15
time: 2264      offset: 16
time: 2355      offset: 17
time: 2348      offset: 18
time: 2262      offset: 19
time: 2260      offset: 20
time: 2262      offset: 21
time: 2260      offset: 22
time: 2490      offset: 23
time: 2261      offset: 24
time: 2260      offset: 25
time: 2255      offset: 26
time: 2261      offset: 27
time: 2263      offset: 28
time: 2260      offset: 29
time: 2260      offset: 30
time: 2262      offset: 31
time: 2264      offset: 32
time: 2355      offset: 33
time: 2266      offset: 34
time: 2270      offset: 35
time: 2260      offset: 36
time: 2268      offset: 37
time: 2260      offset: 38
time: 2260      offset: 39
time: 2262      offset: 40
time: 2260      offset: 41
time: 2259      offset: 42
time: 2260      offset: 43
time: 2260      offset: 44
time: 2255      offset: 45
time: 2260      offset: 46
time: 2265      offset: 47
time: 2263      offset: 48
time: 2355      offset: 49
time: 2293      offset: 50
time: 2204      offset: 51
time: 2323      offset: 52
time: 2200      offset: 53
time: 2200      offset: 54
time: 2460      offset: 55
time: 2200      offset: 56

随着偏移量变大,可能会有大约20%的差异?

2 个答案:

答案 0 :(得分:3)

您似乎已经使用/O2和/arch:AVX选项编译了代码。编译器已经将内部循环展开了8次,您可以看到有8个vmulss/vaddss/vmovss序列。a和b数组的地址分别存储在寄存器rdx和rdi中。第一条指令加载单个元素并将其乘以常数,第二条指令将结果与另一个数组中的相应元素相加,而第三条指令将结果存储到同一位置。考虑两个这样的序列:

000000013F251125  vmulss      xmm1,xmm6,dword ptr [rdi+rcx*4]  
000000013F25112A  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F25112F  vmovss      dword ptr [rdx-28h],xmm2  
000000013F251134  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251139  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F25113E  vmovss      dword ptr [rdx-24h],xmm2  

rcx和rax分别被初始化为零和一。rdx初始化为数组a的基地址加0x28。在不失一般性的前提下,假设a和b的基地址分别为0x1000和0x2000。那么按程序顺序的内存访问顺序为:

load 0x2000
load 0x1000
store 0x1000
load 0x2004
load 0x1004
store 0x1004

是否有任何加载与先前的存储发生别名?请注意,您弄错了别名规则(请参见Peter的答案和其中链接的答案)。在load 0x2004和store 0x1000中,位5-11不同。在load 0x1004和store 0x1000中,位5-11相等。但是,这两次访问并不重叠,因此没有别名。现在应该很容易明白为什么整个循环中没有任何加载会与先前的存储发生别名。

如果将内循环主体更改为:

a[i] *= 1.234f;
b[j] += 4.321f;

然后来自每个数组的每对元素的序列变为vmulss/vmovss/vaddss/vmovss。前两个序列的内存访问配置文件是:

load 0x1000
store 0x1000
load 0x2000
store 0x2000

很明显,第二个加载与第一个存储发生别名。由于j索引的计算方式,我尚不清楚在所有迭代中是否都会出现别名。但是在这种情况下别名确实存在。相反,使用索引i将确保在所有迭代中,每个vmulss/vmovss/vaddss/vmovss序列都有一个发生别名的加载。

  

> 所以我不太确定为什么从产生的计时结果中看不出这个问题?

执行时间只能告诉您程序执行得有多快,而不能告诉您原因。可观察到的4K别名损失取决于周围的代码和别名条件的总数。另外,对于不同的循环体,生成的二进制代码也不同,因此,如果执行时间发生了变化,并不一定意味着这是4K别名造成的。是的,4K别名可能是一个因素,甚至可能是一个主要因素,但是可能还有其他因素影响了执行时间。top-down microarchitecture analysis methodology(自顶向下的微架构分析方法)使您能够确定对执行时间影响最大的因素。

通常,确定4K别名是否发生的最佳方法是测量ld_blocks_partial.address_alias。您可以使用Processor Counter Monitor(在Windows,Linux和OSX上运行)来测量Intel PMU事件。我建议您这样做以验证我的分析是正确的。

答案 1 :(得分:2)

低12位为[11:0]位。第11位是第12个位,因为我们从0开始计数。

CPU以字节粒度检测加载/存储别名,而不仅仅是检查加载是否访问了与较旧存储相同的缓存行。存储到array[1]不会减慢对array[2]的加载;否则那对性能确实会非常不利,因为循环遍历一个数组并依次对每个元素进行RMW(读-改-写)是一种很常见的模式。(除非使用软件流水线,在到达存储位置之前提前加载多个元素。)

所以我认为您在这里没有遇到问题,因为您只是在从4k页面内的相同偏移量加载之后,才存储到该位置。如果使用像这样的简单循环(不需要额外的跨步或偏移到另一个额外的页面中,只需使用位于不同页面中的两个数组):

for (i = 0 ; i < limit ; i++) {
    a[i] *= 1.234;
    b[i] += 4.321;    // load from the same offset we just wrote, but in another page
}

,并且编译器生成的asm在从b加载之前先存储到a,那么您就会遇到问题,因为a和b相对于4k页具有相同的对齐方式。

(如果编译器能证明a != b,它可以在存储之前进行加载;也可以生成代码,先做这项检查,再运行采用这种顺序的循环版本。或者在检查重叠范围(矢量宽度乘以展开因子)之后进行自动矢量化和/或展开。)

这不是一个完美的例子,但是对b的加载依赖于对a的存储,这应该会使乱序执行至少要更努力地工作才能隐藏这么大的延迟。


创建4k别名的另一种简便方法是从src = srcpage复制到dst = dstpage + 16或类似的位置,当然srcpage和dstpage都是页面对齐的。对dst[i]的存储相当于对dstpage[i+16]的存储(以字节为单位,而不是C元素的大小),因此(按程序顺序)它会先于从src[i+16]的加载。当循环到达该i值时,该加载将被4k别名阻塞。

有关示例,请参见“L1 memory bandwidth: 50% drop in efficiency using addresses which differ by 4096+64 bytes”,其中有@HadiBrais对包括IvyBridge(例如i7-3770k)在内的CPU所做的性能分析。