我正在使用下面的程序来测试各种简单操作的速度。这是我用来测试“将变量加载到寄存器中”的速度的代码:
#include <iostream>
#include <Windows.h>
void main()
{
DWORD _time;
int val = 1;
int* ptra = &val;
for (auto a = 0; a < 20; a++)
{
_time = GetTickCount();
for (auto i = 0; i < 100000000; i++)
{
_asm
{
mov eax, val
}
}
_time = GetTickCount() - _time;
std::cout << _time << std::endl;
};
std::cout << buf << std::endl;
system("pause");
for (auto a = 0; a < 20; a++)
{
_time = GetTickCount();
for (auto i = 0; i < 100000000; i++)
{
_asm
{
mov eax, dword ptr[ptra]
mov ebx, dword ptr[eax]
}
}
_time = GetTickCount() - _time;
std::cout << _time << std::endl;
};
std::cout << buf << std::endl;
system("pause");
}
我的电脑上的测试平均值:直接访问约为 234,通过指针访问约为 203(GetTickCount() 的计数,单位为毫秒)。
出于某种原因,通过指针访问反而比直接访问更快。我已经在两台 PC 上测试过,得到了相同的结果。起初我以为是某种 CPU 优化在起作用,但这就意味着使用指针比使用变量本身更高效,这听起来很奇怪。现在我觉得是我哪里做错了,也可能是 GetTickCount() 的问题。无论如何,我没有找到任何能帮助我理解发生了什么的资料,也没有人能解释这一点。
双重指针的结果与直接访问的结果相同;指针链越长,运行越慢。
用 1 替换 val 会快一些,但仍比通过指针访问慢。
空循环需要相同的时间才能完成。任何更改都不会影响这一行为。
这是我目前正在使用的代码
#include <iostream>
#include <Windows.h>
int* ptra;
int val;
void main()
{
DWORD _time;
val = 1;
ptra = &val;
HANDLE TH = GetCurrentThread();
HANDLE PH = GetCurrentProcess();
PDWORD_PTR APMask = new ULONG_PTR;
PDWORD_PTR ASMask = new ULONG_PTR;
ULONG_PTR Core = 1;
GetProcessAffinityMask(PH, APMask, ASMask);
while (!(Core && *APMask)) Core = Core << 1;
SetThreadAffinityMask(TH, Core);
SetThreadPriority(TH, THREAD_PRIORITY_TIME_CRITICAL);
SetPriorityClass(PH, REALTIME_PRIORITY_CLASS);
DWORD64 ProcessorTime;
#define order 0
#define loops 10000000
//=========================================
for (auto a = 0; a < 20; a++)
{
//_time = GetTickCount();
ProcessorTime = __rdtsc();
for (auto i = 0; i < loops; i++)
{
_asm
{
#if order == 1
mov eax, dword ptr[ptra]
mov ebx, dword ptr[eax]
#else
mov eax, val
#endif
}
}
//_time = GetTickCount() - _time;
ProcessorTime = __rdtsc() - ProcessorTime;
std::cout << ProcessorTime << std::endl;
};
//system("pause");
//=========================================
std::cout << "=" << std::endl;
//=========================================
for (auto a = 0; a < 20; a++)
{
//_time = GetTickCount();
ProcessorTime = __rdtsc();
for (auto i = 0; i < loops; i++)
{
_asm
{
#if order == 1
mov eax, val
#else
mov eax, dword ptr[ptra]
mov ebx, dword ptr[eax]
#endif
}
}
//_time = GetTickCount() - _time;
ProcessorTime = __rdtsc() - ProcessorTime;
std::cout << ProcessorTime << std::endl;
};
//=========================================
SetPriorityClass(PH, NORMAL_PRIORITY_CLASS);
SetThreadPriority(TH, THREAD_PRIORITY_NORMAL);
SetThreadAffinityMask(TH, *APMask);
system("pause");
}
Asm代码:
; main(): prologue and benchmark setup (compiler-generated x86 listing).
; Lines of the form "; N : ..." echo the C++ source line being compiled.
; All locals, including val and ptra, are addressed relative to ebp in this
; listing, i.e. they live in main's stack frame.
; 5 : {
; Frame setup; 124 bytes are reserved for locals.
push ebp
mov ebp, esp
sub esp, 124 ; 0000007cH
; The security cookie XORed with ebp is stored in the frame (__$ArrayPad$).
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; ebx, esi and edi are saved here.
push ebx
push esi
push edi
; 6 : //DWORD _time;
; 7 :
; 8 : int* ptra;
; 9 : int val;
; 10 :
; 11 : val = 1;
mov DWORD PTR _val$[ebp], 1
; 12 : ptra = &val;
lea eax, DWORD PTR _val$[ebp]
mov DWORD PTR _ptra$[ebp], eax
; 13 : short unsigned a;
; 14 :
; 15 : HANDLE TH = GetCurrentThread();
call DWORD PTR __imp__GetCurrentThread@0
mov DWORD PTR _TH$[ebp], eax
; 16 : HANDLE PH = GetCurrentProcess();
call DWORD PTR __imp__GetCurrentProcess@0
mov DWORD PTR _PH$[ebp], eax
; 17 : PDWORD_PTR APMask = new ULONG_PTR;
; operator new is called with a size of 4; the caller pops the argument.
; The result passes through the temporary $T2 before reaching APMask.
push 4
call ??2@YAPAXI@Z ; operator new
add esp, 4
mov DWORD PTR $T2[ebp], eax
mov eax, DWORD PTR $T2[ebp]
mov DWORD PTR _APMask$[ebp], eax
; 18 : PDWORD_PTR ASMask = new ULONG_PTR;
push 4
call ??2@YAPAXI@Z ; operator new
add esp, 4
mov DWORD PTR $T1[ebp], eax
mov eax, DWORD PTR $T1[ebp]
mov DWORD PTR _ASMask$[ebp], eax
; 19 : ULONG_PTR Core = 1;
mov DWORD PTR _Core$[ebp], 1
; 20 : GetProcessAffinityMask(PH, APMask, ASMask);
; Arguments are pushed last-to-first (ASMask, APMask, PH).
mov eax, DWORD PTR _ASMask$[ebp]
push eax
mov ecx, DWORD PTR _APMask$[ebp]
push ecx
mov edx, DWORD PTR _PH$[ebp]
push edx
call DWORD PTR __imp__GetProcessAffinityMask@12
$LN2@main:
; 21 : while (!(Core && *APMask)) Core = Core << 1;
; NOTE(review): the condition is compiled as two independent zero tests
; (Core != 0, then *APMask != 0), i.e. a logical AND. No bit of Core is tested
; against the mask; if a bit test was intended the source needs `&`.
cmp DWORD PTR _Core$[ebp], 0
je SHORT $LN16@main
mov eax, DWORD PTR _APMask$[ebp]
cmp DWORD PTR [eax], 0
jne SHORT $LN3@main
$LN16@main:
; Loop body: Core <<= 1, then re-evaluate the condition.
mov eax, DWORD PTR _Core$[ebp]
shl eax, 1
mov DWORD PTR _Core$[ebp], eax
jmp SHORT $LN2@main
$LN3@main:
; 22 : SetThreadAffinityMask(TH, Core);
mov eax, DWORD PTR _Core$[ebp]
push eax
mov ecx, DWORD PTR _TH$[ebp]
push ecx
call DWORD PTR __imp__SetThreadAffinityMask@8
; 23 : SetThreadPriority(TH, THREAD_PRIORITY_TIME_CRITICAL);
push 15 ; 0000000fH
mov eax, DWORD PTR _TH$[ebp]
push eax
call DWORD PTR __imp__SetThreadPriority@8
; 24 : SetPriorityClass(PH, REALTIME_PRIORITY_CLASS);
push 256 ; 00000100H
mov eax, DWORD PTR _PH$[ebp]
push eax
call DWORD PTR __imp__SetPriorityClass@8
; First timed benchmark: an outer loop of 50 trials over the 16-bit counter
; `a`, each timing 10,000,000 iterations of a single `mov eax, _val$[ebp]`.
; 25 : DWORD64 ProcessorTime;
; 26 : #define order 0
; 27 : #define loops 10000000
; 28 : #define tests 50
; 29 :
; 30 : //=========================================
; 31 : for (a = 0; a < tests; a++)
xor eax, eax
mov WORD PTR _a$[ebp], ax
jmp SHORT $LN6@main
$LN4@main:
; Outer-loop increment: a++ through memory (16-bit).
mov ax, WORD PTR _a$[ebp]
add ax, 1
mov WORD PTR _a$[ebp], ax
$LN6@main:
; Outer-loop condition: leave when a >= 50.
movzx eax, WORD PTR _a$[ebp]
cmp eax, 50 ; 00000032H
jge SHORT $LN5@main
; 32 : {
; 33 : //_time = GetTickCount();
; 34 : ProcessorTime = __rdtsc();
; rdtsc result: eax is stored as the low dword, edx as the high dword.
rdtsc
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 35 : for (auto i = 0; i < loops; i++)
mov DWORD PTR _i$4[ebp], 0
jmp SHORT $LN9@main
$LN7@main:
; Inner-loop increment: i is kept in memory, so every iteration performs a
; load, an add and a store on it.
mov eax, DWORD PTR _i$4[ebp]
add eax, 1
mov DWORD PTR _i$4[ebp], eax
$LN9@main:
cmp DWORD PTR _i$4[ebp], 10000000 ; 00989680H
jge SHORT $LN8@main
; 36 : {
; 37 : _asm
; 38 : {
; 39 : #if order == 1
; 40 : mov eax, dword ptr[ptra]
; 41 : mov ebx, dword ptr[eax]
; 42 : #else
; 43 : mov eax, val
; The single measured instruction: one load from the stack frame.
mov eax, DWORD PTR _val$[ebp]
; 44 : #endif
; 45 : }
; 46 : }
jmp SHORT $LN7@main
$LN8@main:
; 47 : //_time = GetTickCount() - _time;
; 48 : ProcessorTime = __rdtsc() - ProcessorTime;
; 64-bit subtraction done as sub (low dword) + sbb (high dword with borrow).
rdtsc
sub eax, DWORD PTR _ProcessorTime$[ebp]
sbb edx, DWORD PTR _ProcessorTime$[ebp+4]
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 49 : std::cout << ProcessorTime << std::endl;
; The address of std::endl is pushed first (argument of the second operator<<
; call), then ProcessorTime as high dword followed by low dword; ecx carries
; the stream object for each call.
push OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
mov eax, DWORD PTR _ProcessorTime$[ebp+4]
push eax
mov ecx, DWORD PTR _ProcessorTime$[ebp]
push ecx
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@_K@Z
mov ecx, eax
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z
; 50 : };
jmp SHORT $LN4@main
; Exit point of the first benchmark loop: prints the "=" separator line.
$LN5@main:
; 51 : //system("pause");
; 52 : //=========================================
; 53 : std::cout << "=" << std::endl;
; The string insertion is a free function taking (cout, string) on the stack,
; and the caller pops those 8 bytes. The stream it returns in eax is passed in
; ecx to the member operator<< that applies std::endl.
push OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
push OFFSET ??_C@_01NEMOKFLO@?$DN?$AA@
mov eax, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
push eax
call ??$?6U?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@PBD@Z ; std::operator<<<std::char_traits<char> >
add esp, 8
mov ecx, eax
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z
; Second timed benchmark: an outer loop of 50 trials over the 16-bit counter
; `a`, each timing 10,000,000 iterations of the pointer-based access (two
; loads).
; 54 : //=========================================
; 55 : for (a = 0; a < tests; a++)
xor eax, eax
mov WORD PTR _a$[ebp], ax
jmp SHORT $LN12@main
$LN10@main:
; Outer-loop increment: a++ through memory (16-bit).
mov ax, WORD PTR _a$[ebp]
add ax, 1
mov WORD PTR _a$[ebp], ax
$LN12@main:
; Outer-loop condition: leave when a >= 50.
movzx eax, WORD PTR _a$[ebp]
cmp eax, 50 ; 00000032H
jge SHORT $LN11@main
; 56 : {
; 57 : //_time = GetTickCount();
; 58 : ProcessorTime = __rdtsc();
; rdtsc result: eax is stored as the low dword, edx as the high dword.
rdtsc
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 59 : for (auto i = 0; i < loops; i++)
mov DWORD PTR _i$3[ebp], 0
jmp SHORT $LN15@main
$LN13@main:
; Inner-loop increment: i is kept in memory, so every iteration performs a
; load, an add and a store on it.
mov eax, DWORD PTR _i$3[ebp]
add eax, 1
mov DWORD PTR _i$3[ebp], eax
$LN15@main:
cmp DWORD PTR _i$3[ebp], 10000000 ; 00989680H
jge SHORT $LN14@main
; 60 : {
; 61 : _asm
; 62 : {
; 63 : #if order == 1
; 64 : mov eax, val
; 65 : #else
; 66 : mov eax, dword ptr[ptra]
; Measured code, load 1: fetch the pointer value from the stack frame.
mov eax, DWORD PTR _ptra$[ebp]
; 67 : mov ebx, dword ptr[eax]
; Measured code, load 2: dereference it; the address comes from eax, which
; the previous instruction wrote.
mov ebx, DWORD PTR [eax]
; 68 : #endif
; 69 : }
; 70 : }
jmp SHORT $LN13@main
$LN14@main:
; 71 : //_time = GetTickCount() - _time;
; 72 : ProcessorTime = __rdtsc() - ProcessorTime;
; 64-bit subtraction done as sub (low dword) + sbb (high dword with borrow).
rdtsc
sub eax, DWORD PTR _ProcessorTime$[ebp]
sbb edx, DWORD PTR _ProcessorTime$[ebp+4]
mov DWORD PTR _ProcessorTime$[ebp], eax
mov DWORD PTR _ProcessorTime$[ebp+4], edx
; 73 : std::cout << ProcessorTime << std::endl;
; The address of std::endl is pushed first (argument of the second operator<<
; call), then ProcessorTime as high dword followed by low dword; ecx carries
; the stream object for each call.
push OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
mov eax, DWORD PTR _ProcessorTime$[ebp+4]
push eax
mov ecx, DWORD PTR _ProcessorTime$[ebp]
push ecx
mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@_K@Z
mov ecx, eax
call DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z
; 74 : };
jmp SHORT $LN10@main
; Exit point of the second benchmark loop: restore priority and affinity,
; pause, then run main's epilogue.
$LN11@main:
; 75 : //=========================================
; 76 : SetPriorityClass(PH, NORMAL_PRIORITY_CLASS);
push 32 ; 00000020H
mov eax, DWORD PTR _PH$[ebp]
push eax
call DWORD PTR __imp__SetPriorityClass@8
; 77 : SetThreadPriority(TH, THREAD_PRIORITY_NORMAL);
push 0
mov eax, DWORD PTR _TH$[ebp]
push eax
call DWORD PTR __imp__SetThreadPriority@8
; 78 : SetThreadAffinityMask(TH, *APMask);
; The mask value is read through the APMask pointer and passed by value.
mov eax, DWORD PTR _APMask$[ebp]
mov ecx, DWORD PTR [eax]
push ecx
mov edx, DWORD PTR _TH$[ebp]
push edx
call DWORD PTR __imp__SetThreadAffinityMask@8
; 79 : system("pause");
; The caller pops the single argument after the call.
push OFFSET ??_C@_05PDJBBECF@pause?$AA@
call DWORD PTR __imp__system
add esp, 4
; 80 : }
; The second jmp can never execute: it directly follows an unconditional jmp.
jmp SHORT $LN19@main
jmp SHORT $LN18@main
$LN19@main:
; eax is zeroed before returning (presumably the value main hands back to its
; caller; TODO confirm).
xor eax, eax
$LN18@main:
; Epilogue: restore edi/esi/ebx, recompute the stored cookie XOR ebp into ecx
; and pass it to __security_check_cookie, then tear down the frame.
pop edi
pop esi
pop ebx
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call @__security_check_cookie@4
mov esp, ebp
pop ebp
ret 0
以上是我在 MSVC++ 14.0 中得到的汇编输出,所有优化都已关闭(开启优化时,编译器会删掉所有没有实际作用的代码,测试代码也会被删掉)。
答案 0 :(得分:0)
您的相关代码中没有做循环对齐,这可能会给结果带来足够大的波动,使其毫无意义。$LN13@main:
和$LN7@main:
必须以相同的方式对齐(对齐多少取决于您的平台)。实际上,您应该把从ProcessorTime = __rdtsc();
之前开始的整个代码块对齐,并且两种情况也许都应该跳转到同一个地址来执行(在清空缓存或改变缓存内容之后,具体取决于您想要测量什么)。
也就是说,在现代 x86 上,您的这两段代码在指令层面都过于简单,甚至无法让单个核心饱和,因此在真实代码中,这样的mov
很可能会与其他指令并行执行。所有的“时间”都消耗在访问栈内存和依赖关系上。例如,在第二种(间接访问)情况下,如果两条 mov 都只使用eax
作为目的寄存器,也许会因为冲突更多而稍微变慢……嗯……可能也不会:CPU 在每次新的循环迭代中仍有许多空闲寄存器,所以它很可能通过每次重命名eax
来避免任何假依赖/冲突。
总的来说,测量这种“脚手架”代码是没有意义的,您应该测量真正在做事情的算法,因为您现在测量的东西顶多只会增加大约 1 个周期的开销,而这点开销很可能完全淹没在真实代码瓶颈的几十甚至几百个周期之中。