为什么通过指针访问变量会比直接访问更快?

时间:2017-03-15 12:49:58

标签: c++ performance pointers assembly

我正在用下面的程序测试各种简单操作的速度。这是我用来测试把变量加载到寄存器的速度的代码:

#include <iostream>
#include <Windows.h>

void main()
{
    DWORD _time;

    int val = 1;
    int* ptra = &val;
    for (auto a = 0; a < 20; a++)
    {
        _time = GetTickCount();
        for (auto i = 0; i < 100000000; i++)
        {
            _asm
            {
                mov         eax, val
            }
        }
        _time = GetTickCount() - _time;
        std::cout << _time << std::endl;
    };
    std::cout << buf << std::endl;
    system("pause");
    for (auto a = 0; a < 20; a++)
    {
        _time = GetTickCount();
        for (auto i = 0; i < 100000000; i++)
        {
            _asm
            {
                mov     eax, dword ptr[ptra]
                mov     ebx, dword ptr[eax]
            }
        }
        _time = GetTickCount() - _time;
        std::cout << _time << std::endl;
    };
    std::cout << buf << std::endl;
    system("pause");
}

在我的电脑上,两项测试的平均耗时分别为:直接访问 234,通过指针访问 203(单位为 GetTickCount 的毫秒)。

出于某种原因,通过指针访问反而比直接访问更快。我在两台电脑上测试过,结果相同。起初我以为是某种 CPU 优化在起作用,但那就意味着使用指针比使用变量本身更高效,这听起来很奇怪。现在我怀疑是我哪里做错了,也可能是 GetTickCount() 的问题;但无论如何,我没有找到任何能帮助我理解发生了什么的资料,也没有人能解释这一现象。

使用二级指针(指向指针的指针)时,结果与直接访问相同;指针链越长,运行越慢。

用常量 1 替换 val 会快一些,但仍然比通过指针访问慢。

空循环完成所需的时间也相同。无论怎么修改,这种现象都不变。

这是我目前正在使用的代码

#include <iostream>
#include <Windows.h>

// Pointer read by the indirect-load benchmark case; main() points it at `val`.
int* ptra;
// Integer loaded by both benchmark cases; main() sets it to 1 before timing.
int val;

void main()
{
    DWORD _time;

    val = 1;
    ptra = &val;

    HANDLE TH = GetCurrentThread();
    HANDLE PH = GetCurrentProcess();
    PDWORD_PTR APMask = new ULONG_PTR;
    PDWORD_PTR ASMask = new ULONG_PTR;
    ULONG_PTR Core = 1;
    GetProcessAffinityMask(PH, APMask, ASMask);
    while (!(Core && *APMask)) Core = Core << 1;
    SetThreadAffinityMask(TH, Core);
    SetThreadPriority(TH, THREAD_PRIORITY_TIME_CRITICAL);
    SetPriorityClass(PH, REALTIME_PRIORITY_CLASS);
    DWORD64 ProcessorTime;
#define order 0
#define loops 10000000

    //=========================================
    for (auto a = 0; a < 20; a++)
    {
        //_time = GetTickCount();
        ProcessorTime = __rdtsc();
        for (auto i = 0; i < loops; i++)
        {
            _asm
            {
#if order == 1
                mov     eax, dword ptr[ptra]
                mov     ebx, dword ptr[eax]
#else
                mov         eax, val
#endif
            }
        }
        //_time = GetTickCount() - _time;
        ProcessorTime = __rdtsc() - ProcessorTime;
        std::cout << ProcessorTime << std::endl;
    };
    //system("pause");
    //=========================================
    std::cout << "=" << std::endl;
    //=========================================
    for (auto a = 0; a < 20; a++)
    {
        //_time = GetTickCount();
        ProcessorTime = __rdtsc();
        for (auto i = 0; i < loops; i++)
        {
            _asm
            {
#if order == 1
                mov         eax, val
#else
                mov     eax, dword ptr[ptra]
                mov     ebx, dword ptr[eax]
#endif
            }
        }
        //_time = GetTickCount() - _time;
        ProcessorTime = __rdtsc() - ProcessorTime;
        std::cout << ProcessorTime << std::endl;
    };
    //=========================================
    SetPriorityClass(PH, NORMAL_PRIORITY_CLASS);
    SetThreadPriority(TH, THREAD_PRIORITY_NORMAL);
    SetThreadAffinityMask(TH, *APMask);
    system("pause");
}

Asm代码:

; 5    : {

    push    ebp
    mov ebp, esp
    sub esp, 124                ; 0000007cH
    mov eax, DWORD PTR ___security_cookie
    xor eax, ebp
    mov DWORD PTR __$ArrayPad$[ebp], eax
    push    ebx
    push    esi
    push    edi

; 6    :    //DWORD _time;
; 7    : 
; 8    :    int* ptra;
; 9    :    int val;
; 10   : 
; 11   :    val = 1;

    mov DWORD PTR _val$[ebp], 1

; 12   :    ptra = &val;

    lea eax, DWORD PTR _val$[ebp]
    mov DWORD PTR _ptra$[ebp], eax

; 13   :    short unsigned a;
; 14   : 
; 15   :    HANDLE TH = GetCurrentThread();

    call    DWORD PTR __imp__GetCurrentThread@0
    mov DWORD PTR _TH$[ebp], eax

; 16   :    HANDLE PH = GetCurrentProcess();

    call    DWORD PTR __imp__GetCurrentProcess@0
    mov DWORD PTR _PH$[ebp], eax

; 17   :    PDWORD_PTR APMask = new ULONG_PTR;

    push    4
    call    ??2@YAPAXI@Z                ; operator new
    add esp, 4
    mov DWORD PTR $T2[ebp], eax
    mov eax, DWORD PTR $T2[ebp]
    mov DWORD PTR _APMask$[ebp], eax

; 18   :    PDWORD_PTR ASMask = new ULONG_PTR;

    push    4
    call    ??2@YAPAXI@Z                ; operator new
    add esp, 4
    mov DWORD PTR $T1[ebp], eax
    mov eax, DWORD PTR $T1[ebp]
    mov DWORD PTR _ASMask$[ebp], eax

; 19   :    ULONG_PTR Core = 1;

    mov DWORD PTR _Core$[ebp], 1

; 20   :    GetProcessAffinityMask(PH, APMask, ASMask);

    mov eax, DWORD PTR _ASMask$[ebp]
    push    eax
    mov ecx, DWORD PTR _APMask$[ebp]
    push    ecx
    mov edx, DWORD PTR _PH$[ebp]
    push    edx
    call    DWORD PTR __imp__GetProcessAffinityMask@12
; Head of the affinity-mask search loop (source line 21).
$LN2@main:

; 21   :    while (!(Core && *APMask)) Core = Core << 1;

; The source uses logical `&&`, so two independent compare-with-zero tests are
; emitted here; Core and *APMask are never combined with a bitwise AND.
; The loop exits to $LN3@main only when Core != 0 and *APMask != 0.
    cmp DWORD PTR _Core$[ebp], 0
    je  SHORT $LN16@main
    mov eax, DWORD PTR _APMask$[ebp]
    cmp DWORD PTR [eax], 0
    jne SHORT $LN3@main
; Loop body: Core = Core << 1, then retest.
$LN16@main:
    mov eax, DWORD PTR _Core$[ebp]
    shl eax, 1
    mov DWORD PTR _Core$[ebp], eax
    jmp SHORT $LN2@main
$LN3@main:

; 22   :    SetThreadAffinityMask(TH, Core);

    mov eax, DWORD PTR _Core$[ebp]
    push    eax
    mov ecx, DWORD PTR _TH$[ebp]
    push    ecx
    call    DWORD PTR __imp__SetThreadAffinityMask@8

; 23   :    SetThreadPriority(TH, THREAD_PRIORITY_TIME_CRITICAL);

    push    15                  ; 0000000fH
    mov eax, DWORD PTR _TH$[ebp]
    push    eax
    call    DWORD PTR __imp__SetThreadPriority@8

; 24   :    SetPriorityClass(PH, REALTIME_PRIORITY_CLASS);

    push    256                 ; 00000100H
    mov eax, DWORD PTR _PH$[ebp]
    push    eax
    call    DWORD PTR __imp__SetPriorityClass@8

; 25   :    DWORD64 ProcessorTime;
; 26   : #define order 0
; 27   : #define loops 10000000
; 28   : #define tests 50
; 29   : 
; 30   :    //=========================================
; 31   :    for (a = 0; a < tests; a++)

    xor eax, eax
    mov WORD PTR _a$[ebp], ax
    jmp SHORT $LN6@main
$LN4@main:
    mov ax, WORD PTR _a$[ebp]
    add ax, 1
    mov WORD PTR _a$[ebp], ax
$LN6@main:
    movzx   eax, WORD PTR _a$[ebp]
    cmp eax, 50                 ; 00000032H
    jge SHORT $LN5@main

; 32   :    {
; 33   :        //_time = GetTickCount();
; 34   :        ProcessorTime = __rdtsc();

    rdtsc
    mov DWORD PTR _ProcessorTime$[ebp], eax
    mov DWORD PTR _ProcessorTime$[ebp+4], edx

; First timed loop (direct load of val).
; The counter i is kept in memory at _i$4[ebp]: every iteration reloads it,
; adds 1, stores it back and then compares it with 10000000, so each pass
; performs this counter traffic in addition to the single measured load.
; 35   :        for (auto i = 0; i < loops; i++)

    mov DWORD PTR _i$4[ebp], 0
    jmp SHORT $LN9@main
; Increment step: i = i + 1 (goes through eax).
$LN7@main:
    mov eax, DWORD PTR _i$4[ebp]
    add eax, 1
    mov DWORD PTR _i$4[ebp], eax
; Loop condition: leave when i >= loops.
$LN9@main:
    cmp DWORD PTR _i$4[ebp], 10000000       ; 00989680H
    jge SHORT $LN8@main

; 36   :        {
; 37   :            _asm
; 38   :            {
; 39   : #if order == 1
; 40   :                mov     eax, dword ptr[ptra]
; 41   :                mov     ebx, dword ptr[eax]
; 42   : #else
; 43   :                mov         eax, val

; The measured instruction: one load of val into eax.
    mov eax, DWORD PTR _val$[ebp]

; 44   : #endif
; 45   :            }
; 46   :        }

    jmp SHORT $LN7@main
$LN8@main:

; 47   :        //_time = GetTickCount() - _time;
; 48   :        ProcessorTime = __rdtsc() - ProcessorTime;

    rdtsc
    sub eax, DWORD PTR _ProcessorTime$[ebp]
    sbb edx, DWORD PTR _ProcessorTime$[ebp+4]
    mov DWORD PTR _ProcessorTime$[ebp], eax
    mov DWORD PTR _ProcessorTime$[ebp+4], edx

; 49   :        std::cout << ProcessorTime << std::endl;

    push    OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
    mov eax, DWORD PTR _ProcessorTime$[ebp+4]
    push    eax
    mov ecx, DWORD PTR _ProcessorTime$[ebp]
    push    ecx
    mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
    call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@_K@Z
    mov ecx, eax
    call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z

; 50   :    };

    jmp SHORT $LN4@main
$LN5@main:

; 51   :    //system("pause");
; 52   :    //=========================================
; 53   :    std::cout << "=" << std::endl;

    push    OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
    push    OFFSET ??_C@_01NEMOKFLO@?$DN?$AA@
    mov eax, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
    push    eax
    call    ??$?6U?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@PBD@Z ; std::operator<<<std::char_traits<char> >
    add esp, 8
    mov ecx, eax
    call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z

; 54   :    //=========================================
; 55   :    for (a = 0; a < tests; a++)

    xor eax, eax
    mov WORD PTR _a$[ebp], ax
    jmp SHORT $LN12@main
$LN10@main:
    mov ax, WORD PTR _a$[ebp]
    add ax, 1
    mov WORD PTR _a$[ebp], ax
$LN12@main:
    movzx   eax, WORD PTR _a$[ebp]
    cmp eax, 50                 ; 00000032H
    jge SHORT $LN11@main

; 56   :    {
; 57   :        //_time = GetTickCount();
; 58   :        ProcessorTime = __rdtsc();

    rdtsc
    mov DWORD PTR _ProcessorTime$[ebp], eax
    mov DWORD PTR _ProcessorTime$[ebp+4], edx

; Second timed loop (load through ptra).
; Same loop scaffolding as the first timed loop, with the counter kept in
; memory at _i$3[ebp] and reloaded, incremented and stored on every pass.
; 59   :        for (auto i = 0; i < loops; i++)

    mov DWORD PTR _i$3[ebp], 0
    jmp SHORT $LN15@main
; Increment step: i = i + 1 (goes through eax).
$LN13@main:
    mov eax, DWORD PTR _i$3[ebp]
    add eax, 1
    mov DWORD PTR _i$3[ebp], eax
; Loop condition: leave when i >= loops.
$LN15@main:
    cmp DWORD PTR _i$3[ebp], 10000000       ; 00989680H
    jge SHORT $LN14@main

; 60   :        {
; 61   :            _asm
; 62   :            {
; 63   : #if order == 1
; 64   :                mov         eax, val
; 65   : #else
; 66   :                mov     eax, dword ptr[ptra]

; Measured instruction 1: load the pointer value into eax.
    mov eax, DWORD PTR _ptra$[ebp]

; 67   :                mov     ebx, dword ptr[eax]

; Measured instruction 2: load through the pointer into ebx (depends on eax).
    mov ebx, DWORD PTR [eax]

; 68   : #endif
; 69   :            }
; 70   :        }

    jmp SHORT $LN13@main
$LN14@main:

; 71   :        //_time = GetTickCount() - _time;
; 72   :        ProcessorTime = __rdtsc() - ProcessorTime;

    rdtsc
    sub eax, DWORD PTR _ProcessorTime$[ebp]
    sbb edx, DWORD PTR _ProcessorTime$[ebp+4]
    mov DWORD PTR _ProcessorTime$[ebp], eax
    mov DWORD PTR _ProcessorTime$[ebp+4], edx

; 73   :        std::cout << ProcessorTime << std::endl;

    push    OFFSET ??$endl@DU?$char_traits@D@std@@@std@@YAAAV?$basic_ostream@DU?$char_traits@D@std@@@0@AAV10@@Z ; std::endl<char,std::char_traits<char> >
    mov eax, DWORD PTR _ProcessorTime$[ebp+4]
    push    eax
    mov ecx, DWORD PTR _ProcessorTime$[ebp]
    push    ecx
    mov ecx, DWORD PTR __imp_?cout@std@@3V?$basic_ostream@DU?$char_traits@D@std@@@1@A
    call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@_K@Z
    mov ecx, eax
    call    DWORD PTR __imp_??6?$basic_ostream@DU?$char_traits@D@std@@@std@@QAEAAV01@P6AAAV01@AAV01@@Z@Z

; 74   :    };

    jmp SHORT $LN10@main
$LN11@main:

; 75   :    //=========================================
; 76   :    SetPriorityClass(PH, NORMAL_PRIORITY_CLASS);

    push    32                  ; 00000020H
    mov eax, DWORD PTR _PH$[ebp]
    push    eax
    call    DWORD PTR __imp__SetPriorityClass@8

; 77   :    SetThreadPriority(TH, THREAD_PRIORITY_NORMAL);

    push    0
    mov eax, DWORD PTR _TH$[ebp]
    push    eax
    call    DWORD PTR __imp__SetThreadPriority@8

; 78   :    SetThreadAffinityMask(TH, *APMask);

    mov eax, DWORD PTR _APMask$[ebp]
    mov ecx, DWORD PTR [eax]
    push    ecx
    mov edx, DWORD PTR _TH$[ebp]
    push    edx
    call    DWORD PTR __imp__SetThreadAffinityMask@8

; 79   :    system("pause");

    push    OFFSET ??_C@_05PDJBBECF@pause?$AA@
    call    DWORD PTR __imp__system
    add esp, 4

; 80   : }

    jmp SHORT $LN19@main
    jmp SHORT $LN18@main
$LN19@main:
    xor eax, eax
$LN18@main:
    pop edi
    pop esi
    pop ebx
    mov ecx, DWORD PTR __$ArrayPad$[ebp]
    xor ecx, ebp
    call    @__security_check_cookie@4
    mov esp, ebp
    pop ebp
    ret 0

以上汇编是我用 MSVC++ 14.0 在关闭所有优化的情况下得到的(开启优化时,编译器会删掉所有没有实际作用的代码,测试代码也会被一并删掉)。

1 个答案:

答案 0 :(得分:0)

您的相关代码没有对循环做对齐,这可能给结果带来足够大的波动,使测量失去意义。$LN13@main: 和 $LN7@main: 必须以相同的方式对齐(对齐到多少取决于您的平台)。实际上,您应该在 ProcessorTime = __rdtsc(); 之前对齐整个代码块,并且两种情况下最好都跳转到该地址来执行(在清空缓存或改变缓存状态之后,具体取决于您想测量什么)。

也就是说,在现代 x86 上,您的两段代码在指令层面上都不足以让单个核心饱和,因此在实际代码中,这类 mov 指令很可能与其他指令同时执行。所有的“时间”都耗在了访问栈内存和依赖关系上。例如,在第二种(间接访问)情况下,如果两条 mov 都只用 eax 作为目的寄存器,也许会因为冲突更多而稍微变慢……嗯……也可能不会:CPU 在每次新的循环迭代中仍有许多备用寄存器可用,所以它很可能通过每次重命名 eax 来避免任何虚假的依赖/冲突。

总的来说,测量这种“脚手架”代码是没有意义的。您应该测量真正做事的算法,因为您现在测量的东西最多只会增加大约 1 个周期,而这很可能被真实代码中几十到几百个周期的瓶颈所淹没。