我想了解如何使用PREFETCH *指令。 为此,我写了一些代码:
.model flat
.code
?fast_mem_copy_sse@@YAXPAH0H@Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_1:
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVAPS [esi + 0 * 4 * 4], xmm0
MOVAPS [esi + 1 * 4 * 4], xmm1
MOVAPS [esi + 2 * 4 * 4], xmm2
MOVAPS [esi + 3 * 4 * 4], xmm3
MOVAPS [esi + 4 * 4 * 4], xmm4
MOVAPS [esi + 5 * 4 * 4], xmm5
MOVAPS [esi + 6 * 4 * 4], xmm6
MOVAPS [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_1
RET
?fast_mem_copy_sse@@YAXPAH0H@Z ENDP
?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_2:
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVNTDQ [esi + 0 * 4 * 4], xmm0
MOVNTDQ [esi + 1 * 4 * 4], xmm1
MOVNTDQ [esi + 2 * 4 * 4], xmm2
MOVNTDQ [esi + 3 * 4 * 4], xmm3
MOVNTDQ [esi + 4 * 4 * 4], xmm4
MOVNTDQ [esi + 5 * 4 * 4], xmm5
MOVNTDQ [esi + 6 * 4 * 4], xmm6
MOVNTDQ [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_2
RET
?fast_mem_copy_sse_movntdq@@YAXPAH0H@Z ENDP
?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z PROC
MOV edi, [esp + 4] ; destination
MOV esi, [esp + 8] ; source
MOV ecx, [esp + 12] ; n bytes for copy
copy_loop_3:
;PREFETCHT0 [edi + 0 * 4 * 4]
;PREFETCHT0 [edi + 1 * 4 * 4]
;PREFETCHT0 [edi + 2 * 4 * 4]
;PREFETCHT0 [edi + 3 * 4 * 4]
;PREFETCHT0 [edi + 4 * 4 * 4]
;PREFETCHT0 [edi + 5 * 4 * 4]
;PREFETCHT0 [edi + 6 * 4 * 4]
;PREFETCHT0 [edi + 7 * 4 * 4]
PREFETCHT0 [edi]
MOVAPS xmm0, [edi + 0 * 4 * 4]
MOVAPS xmm1, [edi + 1 * 4 * 4]
MOVAPS xmm2, [edi + 2 * 4 * 4]
MOVAPS xmm3, [edi + 3 * 4 * 4]
MOVAPS xmm4, [edi + 4 * 4 * 4]
MOVAPS xmm5, [edi + 5 * 4 * 4]
MOVAPS xmm6, [edi + 6 * 4 * 4]
MOVAPS xmm7, [edi + 7 * 4 * 4]
MOVAPS [esi + 0 * 4 * 4], xmm0
MOVAPS [esi + 1 * 4 * 4], xmm1
MOVAPS [esi + 2 * 4 * 4], xmm2
MOVAPS [esi + 3 * 4 * 4], xmm3
MOVAPS [esi + 4 * 4 * 4], xmm4
MOVAPS [esi + 5 * 4 * 4], xmm5
MOVAPS [esi + 6 * 4 * 4], xmm6
MOVAPS [esi + 7 * 4 * 4], xmm7
ADD esi, 4*4*8
ADD edi, 4*4*8
SUB ecx, 4*8
JNZ copy_loop_3
RET
?fast_mem_copy_sse_prefetch@@YAXPAH0H@Z ENDP
END
#include <string.h>
#include <iostream>
#include <time.h>
//#define CHECK
#define BLOCK_SIZE 8*8
#define AMOUNT_OF_BLOCKS 200*4
#define AMOUNT_OF_RUNS 100000
void fast_mem_copy_sse(int *dst, int *src, int n);
void fast_mem_copy_sse_movntdq(int *dst, int *src, int n);
void fast_mem_copy_sse_prefetch(int *dst, int *src, int n);
void fast_mem_copy(int *dst, int *src, int n)
{
for (int i = 0; i < n; i++) {
*(dst + i) = *(src + i);
}
}
int main()
{
clock_t t;
_declspec(align(16)) int a[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
_declspec(align(16)) int b[AMOUNT_OF_BLOCKS*BLOCK_SIZE];
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse_movntdq(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse_movntdq work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse_movntdq took me " << t << "clicks (" << ((float)t / CLOCKS_PER_SEC) << "seconds).\n";
///////////////////////////////////////////////////////////////////////////////
t = clock();
for (int i = 0; i < AMOUNT_OF_RUNS; i++) {
memset(a, i, BLOCK_SIZE * AMOUNT_OF_BLOCKS * sizeof(int));
fast_mem_copy_sse_prefetch(b, a, BLOCK_SIZE * AMOUNT_OF_BLOCKS);
#ifdef CHECK
for (int j = 0; j < BLOCK_SIZE * AMOUNT_OF_BLOCKS; j++) {
if (a[j] != b[j]) {
std::cout << "fast_mem_copy_sse_prefetch work wrong; j = " << j << "\n";
}
}
#endif
}
t = clock() - t;
std::cout << "fast_mem_copy_sse_prefetch took me " << t << " clicks (" << ((float)t / CLOCKS_PER_SEC) << " seconds).\n";
system("PAUSE");
return 0;
}
我得到了以下结果:
fast_mem_copy took me 11262 clicks (11.262 seconds).
fast_mem_copy_sse took me 1940 clicks (1.94 seconds).
fast_mem_copy_sse_movntdq took me 3570 clicks (3.57 seconds).
fast_mem_copy_sse_prefetch took me 1970 clicks (1.97 seconds).
那有什么不对? 或者在fast_mem_copy_sse中使用硬件预取,并且没有任何意义使用指令进行预取? 我还使用了VTune,它告诉我没有缓存未命中。
答案 0 :(得分:6)
预取只有在你做得足够重要时才会有所帮助。我相信CPU速度已达到现在需要大约200个CPU周期从RAM获取的程度。有了像你这样的循环,你需要预先预取10次。
另外,如果您正在进行顺序访问的简单复制循环,则CPU硬件已经在为您执行预取。