使用预取优化

时间:2014-04-09 19:34:19

标签: optimization assembly sse

我想了解如何使用PREFETCH *指令。 为此,我写了一些代码:

.model flat

.code

?fast_mem_copy_sse@@YAXPAH0H@Z PROC
    MOV     edi, [esp + 4]  ; destination
    MOV     esi, [esp + 8]  ; source
    MOV     ecx, [esp + 12] ; n bytes for copy  

copy_loop_1:
    MOVAPS  xmm0, [edi + 0 * 4 * 4]
    MOVAPS  xmm1, [edi + 1 * 4 * 4]
    MOVAPS  xmm2, [edi + 2 * 4 * 4]
    MOVAPS  xmm3, [edi + 3 * 4 * 4]
    MOVAPS  xmm4, [edi + 4 * 4 * 4]
    MOVAPS  xmm5, [edi + 5 * 4 * 4]
    MOVAPS  xmm6, [edi + 6 * 4 * 4]
    MOVAPS  xmm7, [edi + 7 * 4 * 4]

    MOVAPS  [esi + 0 * 4 * 4], xmm0
    MOVAPS  [esi + 1 * 4 * 4], xmm1
    MOVAPS  [esi + 2 * 4 * 4], xmm2
    MOVAPS  [esi + 3 * 4 * 4], xmm3
    MOVAPS  [esi + 4 * 4 * 4], xmm4
    MOVAPS  [esi + 5 * 4 * 4], xmm5
    MOVAPS  [esi + 6 * 4 * 4], xmm6
    MOVAPS  [esi + 7 * 4 * 4], xmm7

    ADD     esi, 4*4*8
    ADD     edi, 4*4*8

    SUB     ecx, 4*8
    JNZ     copy_loop_1

    RET
?fast_mem_copy_sse@@YAXPAH0H@Z ENDP

?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z PROC
    MOV     edi, [esp + 4]  ; destination
    MOV     esi, [esp + 8]  ; source
    MOV     ecx, [esp + 12] ; n bytes for copy  

copy_loop_2:

    MOVAPS  xmm0, [edi + 0 * 4 * 4]
    MOVAPS  xmm1, [edi + 1 * 4 * 4]
    MOVAPS  xmm2, [edi + 2 * 4 * 4]
    MOVAPS  xmm3, [edi + 3 * 4 * 4]
    MOVAPS  xmm4, [edi + 4 * 4 * 4]
    MOVAPS  xmm5, [edi + 5 * 4 * 4]
    MOVAPS  xmm6, [edi + 6 * 4 * 4]
    MOVAPS  xmm7, [edi + 7 * 4 * 4]

    MOVNTDQ [esi + 0 * 4 * 4], xmm0
    MOVNTDQ [esi + 1 * 4 * 4], xmm1
    MOVNTDQ [esi + 2 * 4 * 4], xmm2
    MOVNTDQ [esi + 3 * 4 * 4], xmm3
    MOVNTDQ [esi + 4 * 4 * 4], xmm4
    MOVNTDQ [esi + 5 * 4 * 4], xmm5
    MOVNTDQ [esi + 6 * 4 * 4], xmm6
    MOVNTDQ [esi + 7 * 4 * 4], xmm7

    ADD     esi, 4*4*8
    ADD     edi, 4*4*8

    SUB     ecx, 4*8
    JNZ     copy_loop_2

    RET
?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z ENDP

?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z PROC
    MOV     edi, [esp + 4]  ; destination
    MOV     esi, [esp + 8]  ; source
    MOV     ecx, [esp + 12] ; n bytes for copy  

copy_loop_3:
    ;PREFETCHT0 [edi + 0 * 4 * 4]
    ;PREFETCHT0 [edi + 1 * 4 * 4]
    ;PREFETCHT0 [edi + 2 * 4 * 4]
    ;PREFETCHT0 [edi + 3 * 4 * 4]
    ;PREFETCHT0 [edi + 4 * 4 * 4]
    ;PREFETCHT0 [edi + 5 * 4 * 4]
    ;PREFETCHT0 [edi + 6 * 4 * 4]
    ;PREFETCHT0 [edi + 7 * 4 * 4]
    PREFETCHT0 [edi]


    MOVAPS  xmm0, [edi + 0 * 4 * 4]
    MOVAPS  xmm1, [edi + 1 * 4 * 4]
    MOVAPS  xmm2, [edi + 2 * 4 * 4]
    MOVAPS  xmm3, [edi + 3 * 4 * 4]
    MOVAPS  xmm4, [edi + 4 * 4 * 4]
    MOVAPS  xmm5, [edi + 5 * 4 * 4]
    MOVAPS  xmm6, [edi + 6 * 4 * 4]
    MOVAPS  xmm7, [edi + 7 * 4 * 4]

    MOVAPS  [esi + 0 * 4 * 4], xmm0
    MOVAPS  [esi + 1 * 4 * 4], xmm1
    MOVAPS  [esi + 2 * 4 * 4], xmm2
    MOVAPS  [esi + 3 * 4 * 4], xmm3
    MOVAPS  [esi + 4 * 4 * 4], xmm4
    MOVAPS  [esi + 5 * 4 * 4], xmm5
    MOVAPS  [esi + 6 * 4 * 4], xmm6
    MOVAPS  [esi + 7 * 4 * 4], xmm7

    ADD     esi, 4*4*8
    ADD     edi, 4*4*8

    SUB     ecx, 4*8
    JNZ     copy_loop_3

    RET
?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z ENDP

END

#include <string.h>
#include <iostream>
#include <time.h>

//#define CHECK

#define BLOCK_SIZE          8*8
#define AMOUNT_OF_BLOCKS    200*4
#define AMOUNT_OF_RUNS      100000

void fast_mem_copy_sse(int *dst, int *src, int n);
void fast_mem_copy_sse_movntdq(int *dst, int *src, int n);
void fast_mem_copy_sse_prefetch(int *dst, int *src, int n);

void fast_mem_copy(int *dst, int *src, int n)
{
    for (int i = 0; i < n; i++) {
        *(dst + i) = *(src + i);
    }
}

int main() 
{
    clock_t t;

    _declspec(align(16)) int a[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
    _declspec(align(16)) int b[AMOUNT_OF_BLOCKS*BLOCK_SIZE];

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy work wrong; j = " << j << "\n";
            }
        }
#endif
    }

    t = clock() - t;
    std::cout << "fast_mem_copy took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy_sse(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS); 

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy_sse work wrong; j = " << j << "\n";
            }
        }
#endif
    }
    t = clock() - t;
    std::cout << "fast_mem_copy_sse took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy_sse_movntdq(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy_sse_movntdq work wrong; j = " << j << "\n";
            }
        }
#endif
    }
    t = clock() - t;
    std::cout << "fast_mem_copy_sse_movntdq took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";

///////////////////////////////////////////////////////////////////////////////
    t = clock();
    for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
        memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
        fast_mem_copy_sse_prefetch(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);

#ifdef CHECK
        for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
            if (a[j] != b[j]) {
                std::cout << "fast_mem_copy_sse_prefetch work wrong; j = " << j << "\n";
            }
        }
#endif
    }
    t = clock() - t;
    std::cout << "fast_mem_copy_sse_prefetch took me " << t << " clicks (" << ((float)t / CLOCKS_PER_SEC) << " seconds).\n";

    system("PAUSE");
    return 0;
}

我得到了以下结果:

fast_mem_copy took me 11262 clicks (11.262 seconds).
fast_mem_copy_sse took me 1940 clicks (1.94 seconds).
fast_mem_copy_sse_movntdq took me 3570 clicks (3.57 seconds).
fast_mem_copy_sse_prefetch took me 1970 clicks (1.97 seconds).

那有什么不对? 或者在fast_mem_copy_sse中使用硬件预取,并且没有任何意义使用指令进行预取? 我还使用了VTune,它告诉我没有缓存未命中。

1 个答案:

答案 0 :(得分:6)

预取只有在你做得足够重要时才会有所帮助。我相信CPU速度已达到现在需要大约200个CPU周期从RAM获取的程度。有了像你这样的循环,你需要预先预取10次。

另外,如果您正在进行顺序访问的简单复制循环,则CPU硬件已经在为您执行预取。