为什么Godbolt与我在Visual Studio中的实际asm代码生成不同的asm输出?

时间:2019-01-04 08:13:51

标签: c++ assembly visual-c++ visual-studio-2017

这是godbolt生成的代码。

这是Visual Studio在我的main.asm文件中生成的代码(通过 Project -> C/C++ -> Output Files,在 Assembler Output 字段下选择 Assembly with Source Code (/FAs) 启用):

; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1 

    TITLE   c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
    .686P
    .XMM
    include listing.inc
    .model  flat

INCLUDELIB OLDNAMES

EXTRN   __imp____std_terminate:PROC
EXTRN   @__security_check_cookie@4:PROC
EXTRN   __imp____CxxFrameHandler3:PROC
PUBLIC  ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z       ; std::less<void>::operator()<double const &,double const &>
PUBLIC  ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC  ??$clamp@N@std@@YAABNABN00@Z            ; std::clamp<double>
PUBLIC  _main
PUBLIC  ?ProcessOptimized@MyPlugin@@QAEXH@Z     ; MyPlugin::ProcessOptimized
PUBLIC  ?Process@MyPlugin@@QAEXH@Z          ; MyPlugin::Process
PUBLIC  ??1MyPlugin@@QAE@XZ             ; MyPlugin::~MyPlugin
PUBLIC  ??0MyPlugin@@QAE@XZ             ; MyPlugin::MyPlugin
PUBLIC  ?ProcessOptimized@Param@@QAEXHH@Z       ; Param::ProcessOptimized
PUBLIC  ?Process@Param@@QAEXHH@Z            ; Param::Process
PUBLIC  ??0Param@@QAE@XZ                ; Param::Param
PUBLIC  __real@3ff0000000000000
PUBLIC  __real@400921fb54442d18
PUBLIC  __real@4024000000000000
PUBLIC  __real@406fe00000000000
PUBLIC  __xmm@00000003000000020000000100000000
PUBLIC  __xmm@400921fb54442d18400921fb54442d18
PUBLIC  __xmm@406fe00000000000406fe00000000000
EXTRN   __chkstk:PROC
EXTRN   ___security_cookie:DWORD
EXTRN   __fltused:DWORD
;   COMDAT __xmm@406fe00000000000406fe00000000000
CONST   SEGMENT
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
    DB  '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST   ENDS
;   COMDAT __xmm@400921fb54442d18400921fb54442d18
CONST   SEGMENT
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
    DB  018H, '-DT', 0fbH, '!', 09H, '@'
CONST   ENDS
;   COMDAT __xmm@00000003000000020000000100000000
CONST   SEGMENT
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
    DB  00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST   ENDS
;   COMDAT __real@406fe00000000000
CONST   SEGMENT
__real@406fe00000000000 DQ 0406fe00000000000r   ; 255
CONST   ENDS
;   COMDAT __real@4024000000000000
CONST   SEGMENT
__real@4024000000000000 DQ 04024000000000000r   ; 10
CONST   ENDS
;   COMDAT __real@400921fb54442d18
CONST   SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r   ; 3.14159
CONST   ENDS
;   COMDAT __real@3ff0000000000000
CONST   SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r   ; 1
CONST   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0Param@@QAE@XZ
_TEXT   SEGMENT
; Param::Param() -- default constructor, 32-bit x86, `this` arrives in ecx.
; Writes five doubles at 16-byte strides from `this` and returns `this` in eax.
; The offsets and values match the member initializers annotated in the
; MyPlugin constructor listing of this same file:
;   +0  mPhase = 0.0, +16 mPhaseOptimized = 0.0, +32 mNoteFrequency = 10.0,
;   +48 mHostPitch = 1.0, +64 mRadiansPerSample = 1.0
; Clobbers: eax, xmm0.  No stack arguments (ret 0).
??0Param@@QAE@XZ PROC                   ; Param::Param, COMDAT
; _this$ = ecx

; 23   :    Param() { }

    xorps   xmm0, xmm0                  ; xmm0 = 0.0
    mov eax, ecx                        ; eax = this (value left in eax at ret)
    movsd   QWORD PTR [ecx], xmm0       ; [this+0]  = 0.0
    movsd   QWORD PTR [ecx+16], xmm0    ; [this+16] = 0.0
    movsd   xmm0, QWORD PTR __real@4024000000000000 ; xmm0 = 10.0
    movsd   QWORD PTR [ecx+32], xmm0    ; [this+32] = 10.0
    movsd   xmm0, QWORD PTR __real@3ff0000000000000 ; xmm0 = 1.0
    movsd   QWORD PTR [ecx+48], xmm0    ; [this+48] = 1.0
    movsd   QWORD PTR [ecx+64], xmm0    ; [this+64] = 1.0
    ret 0
??0Param@@QAE@XZ ENDP                   ; Param::Param
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@Param@@QAEXHH@Z
_TEXT   SEGMENT
; Param::Process(int voiceIndex, int blockSize) -- scalar phase accumulation.
; Convention as emitted: `this` in ecx, two 4-byte stack arguments, callee pops
; them (ret 8).  Saves and restores ebp, esi, edi.
;
; The blockSize argument is never read (its symbol is tagged $dead$); the loop
; count is the constant 256 loaded into ecx.
;
; Register roles inside the loop:
;   esi  = this
;   edx  = this + 2128 + voiceIndex*2048 -> current pC element;
;          [edx-2048] is the matching pB element
;   ecx  = iterations remaining (starts at 256)
;   xmm1 = phase accumulator (loaded from [this+0], stored back at the end)
;   xmm2 = PI upper bound, xmm5 = 0.0 lower bound
;   xmm3 = [this+48] * [this+32]  (bp0 per source line 29)
;   xmm4 = [this+64]              (mRadiansPerSample per the constructor notes)
;
; Stack temporaries: $T1 = PI, $T2 = 0.0, $T3 = the freshly computed value.
; $T2 and $T3 exist so the lower-bound half of std::clamp can pick an ADDRESS
; with cmov and then load through it; $T1 is written but not read in this body.
$T1 = -24                       ; size = 8
$T3 = -16                       ; size = 8
$T2 = -8                        ; size = 8
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
?Process@Param@@QAEXHH@Z PROC               ; Param::Process, COMDAT
; _this$ = ecx

; 25   :    inline void Process(int voiceIndex, int blockSize) {

    push    ebp
    mov ebp, esp
    sub esp, 24                 ; 00000018H

; 26   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebp]
    xorps   xmm5, xmm5                  ; xmm5 = 0.0 (lower clamp bound)

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    push    esi
    mov esi, ecx                        ; esi = this
    shl eax, 11                 ; 0000000bH
    push    edi
    movsd   QWORD PTR $T1[ebp], xmm2
    mov ecx, 256                ; 00000100H
    movsd   QWORD PTR $T2[ebp], xmm5
    movsd   xmm3, QWORD PTR [esi+48]
    lea edx, DWORD PTR [esi+2128]
    movsd   xmm1, QWORD PTR [esi]       ; xmm1 = phase
    add edx, eax                        ; edx = this + 2128 + voiceIndex*2048
    mulsd   xmm3, QWORD PTR [esi+32]    ; xmm3 = bp0
    movsd   xmm4, QWORD PTR [esi+64]
    npad    11
$LL4@Process:
; xmm0 = xmm4 * (xmm3 * pB[i] + pC[i])
    movsd   xmm0, QWORD PTR [edx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [edx]
    mulsd   xmm0, xmm4
    comisd  xmm0, xmm2                  ; compare value against PI
    movsd   QWORD PTR $T3[ebp], xmm0    ; spill value so its address can be selected below
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; value <= PI (jbe is also taken on an unordered/NaN compare): go check the
; lower bound.  Otherwise the result is PI.
    jbe SHORT $LN10@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm5, xmm0                  ; compare 0.0 against value
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; Branch-free lower bound: eax = &0.0 by default, replaced with &value when
; 0.0 <= value; the result is then loaded through the chosen pointer.
    lea eax, DWORD PTR $T2[ebp]
    lea edi, DWORD PTR $T3[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    add edx, 8                          ; advance one double

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    addsd   xmm1, xmm0                  ; phase += clamped value
    sub ecx, 1
    jne SHORT $LL4@Process

; 35   :        }
; 36   : 
; 37   :        mPhase = phase;
; 38   :    }

    pop edi
    movsd   QWORD PTR [esi], xmm1       ; write phase back to [this+0]
    pop esi
    mov esp, ebp
    pop ebp
    ret 8
?Process@Param@@QAEXHH@Z ENDP               ; Param::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
_TEXT   SEGMENT
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 version that
; handles two doubles per iteration.
; Convention as emitted: `this` in ecx, two 4-byte stack arguments, callee pops
; them (ret 8).  Saves and restores ebx and ebp.
;
; The blockSize argument is never read (symbol tagged $dead$); ecx is loaded
; with the constant 128, i.e. 128 iterations of 2 samples.
;
; Frame: ebx keeps the entry stack pointer so stack arguments stay addressable
; (_voiceIndex$[ebx]); esp is rounded down to a 16-byte boundary and ebp is set
; from it so that _v_phase$ (ebp-16) can be written with movaps.
;
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136 + voiceIndex*2048 (address of pC[1] on entry), +16/iter;
;          [eax-2048] is the pB element at the same index
;   ecx  = iterations remaining
;   xmm0 = v_phase accumulator (both lanes start as [this+16])
;   xmm4 / xmm1 = v_pB0 / v_pC0, already scaled
;   xmm3 / xmm2 = v_pB1 / v_pC1, already scaled
;   xmm5 = broadcast of [this+64] * ([this+48] * [this+32])
;   xmm6 = broadcast of [this+64]
;   xmm7 = 0.0 lower bound; the PI upper bound is read from a memory constant
;
; NOTE(review): every iteration pre-loads the NEXT pair (pB+2 / pC+2), so the
; 128th iteration reads elements 256..257 of a row, while shl 11 implies rows of
; 256 doubles. Confirm at source level that these look-ahead loads are in bounds.
_v_phase$ = -16                     ; size = 16
_voiceIndex$ = 8                    ; size = 4
_blockSize$dead$ = 12                   ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC          ; Param::ProcessOptimized, COMDAT
; _this$ = ecx

; 39   :    inline void ProcessOptimized(int voiceIndex, int blockSize) {

    push    ebx
    mov ebx, esp                        ; ebx = argument base
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    push    ebp                         ; esp is 16-byte aligned again after this push
    mov ebp, DWORD PTR [ebx+4]          ; fetch the return address ...
    mov DWORD PTR [esp+4], ebp          ; ... and copy it just above the saved ebp
    mov ebp, esp                        ; ebp = aligned frame base

; 40   :        double *pB = b[voiceIndex];

    mov eax, DWORD PTR _voiceIndex$[ebx]
    mov edx, ecx                        ; edx = this
    shl eax, 11                 ; 0000000bH
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    sub esp, 16                 ; 00000010H
    xorps   xmm7, xmm7                  ; xmm7 = 0.0 lower bound
    mov ecx, 128                ; 00000080H

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;

    movsd   xmm5, QWORD PTR [edx+48]
    mulsd   xmm5, QWORD PTR [edx+32]

; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm6, QWORD PTR [edx+64]

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

; The source uses _mm_load_pd here, but the instruction emitted is movups.
    movsd   xmm0, QWORD PTR [eax+edx+80]
    movups  xmm4, XMMWORD PTR [eax+edx+80]
    movups  xmm1, XMMWORD PTR [eax+edx+2128]
    mulsd   xmm5, xmm6
    unpcklpd xmm3, xmm0                 ; xmm3 = (0.0, pB[0])

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [eax+edx+2128]
    add eax, 2136               ; 00000858H
    unpcklpd xmm2, xmm0                 ; xmm2 = (0.0, pC[0])
    add eax, edx

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    movsd   xmm0, QWORD PTR [edx+16]
    unpcklpd xmm5, xmm5                 ; broadcast low lane
    unpcklpd xmm6, xmm6                 ; broadcast low lane
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5
    mulpd   xmm2, xmm6
    unpcklpd xmm0, xmm0                 ; v_phase = (phase, phase)
    npad    2
$LL4@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    addpd   xmm1, xmm4                  ; xmm1 = v_phaseAcc1

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

; The loads for the next iteration are interleaved with this iteration's clamp
; and accumulate, reusing each register as soon as its old value is consumed.
    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3                  ; xmm2 = v_phaseAcc2

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    movups  xmm3, XMMWORD PTR [eax-2048]
    maxpd   xmm1, xmm7
    maxpd   xmm2, xmm7
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm0, xmm1
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm0, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    movups  xmm2, XMMWORD PTR [eax]
    add eax, 16                 ; 00000010H
; v_phase is written to its stack slot on every iteration; the only read of
; that slot is the scalar load after the loop.
    movaps  XMMWORD PTR _v_phase$[ebp], xmm0
    mulpd   xmm4, xmm5
    mulpd   xmm1, xmm6
    mulpd   xmm3, xmm5

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm6
    sub ecx, 1
    jne SHORT $LL4@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

; The index expression is resolved at compile time: element 1 (offset +8) is
; read unconditionally.
    movsd   xmm0, QWORD PTR _v_phase$[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 88   :    }

    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP          ; Param::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??0MyPlugin@@QAE@XZ
_TEXT   SEGMENT
; MyPlugin::MyPlugin() -- constructor, `this` in ecx, returns `this` in eax.
; Saves and restores esi and edi.  No stack arguments (ret 0).
;
; Three things happen here:
;   1. The five double members at [this+0 .. this+64] get their defaults
;      (interleaved with loop setup, see the source-line notes).
;   2. The 256 doubles starting at [this+80] are set to sampleIndex / 255.0,
;      four per iteration, using packed int->double conversion.
;   3. The 2048 bytes starting at [this+2128] are zeroed with rep stosd.
; Only one pass over each buffer is visible; no outer voiceIndex loop remains
; in the generated code.
;
; Register roles in the fill loop:
;   edx  = sampleIndex of the first element in this group of four (step 4)
;   eax  = edx + 2 (taken before edx is advanced)
;   ecx  = write cursor; it is advanced by 32 BEFORE the stores, which is why
;          the stores use negative displacements (-40 .. -16)
;   xmm2 = packed ints {0,1,2,3}; only its low half {0,1} is used (via movq)
;   xmm3 = {255.0, 255.0}
??0MyPlugin@@QAE@XZ PROC                ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    movaps  xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
    xorps   xmm0, xmm0
    movaps  xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
    xor edx, edx                        ; sampleIndex = 0
    push    esi
    mov esi, ecx                        ; esi = this
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR [esi], xmm0

; 97   :        // fill b
; 98   :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR [esi+88]

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR [esi+16], xmm0

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000
    movsd   QWORD PTR [esi+32], xmm0

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000
    movsd   QWORD PTR [esi+48], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

; First pair: {i, i+1} / 255.0
    movd    xmm0, edx
    lea eax, DWORD PTR [edx+2]          ; eax = i + 2 for the second pair
    pshufd  xmm1, xmm0, 0               ; broadcast i to all four int lanes
    lea ecx, DWORD PTR [ecx+32]         ; advance cursor by four doubles
    movq    xmm0, xmm2                  ; xmm0 = {0, 1, 0, 0}
    add edx, 4
    paddd   xmm1, xmm0                  ; low lanes = {i, i+1}
    cvtdq2pd xmm0, xmm1                 ; convert the two low ints to doubles
    divpd   xmm0, xmm3

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movlpd  QWORD PTR [ecx-40], xmm0
    movhpd  QWORD PTR [ecx-32], xmm0
; Second pair: {i+2, i+3} / 255.0, same sequence seeded from eax
    movd    xmm0, eax
    pshufd  xmm1, xmm0, 0
    movq    xmm0, xmm2
    paddd   xmm1, xmm0
    cvtdq2pd xmm0, xmm1
    divpd   xmm0, xmm3
    movlpd  QWORD PTR [ecx-24], xmm0
    movhpd  QWORD PTR [ecx-16], xmm0
    cmp edx, 256                ; 00000100H
    jl  SHORT $LL7@MyPlugin

; 103  :            }
; 104  :        }
; 105  : 
; 106  :        // fill c
; 107  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

; The whole zero-fill loop is replaced by a 512-dword (2048-byte) rep stosd of 0.
    lea edi, DWORD PTR [esi+2128]
    xor eax, eax
    mov ecx, 512                ; 00000200H
    rep stosd

; 109  :                double value = 0.0;
; 110  : 
; 111  :                mParam1.c[voiceIndex][sampleIndex] = value;
; 112  :            }
; 113  :        }
; 114  :    }

    pop edi
    mov eax, esi                        ; eax = this
    pop esi
    ret 0
??0MyPlugin@@QAE@XZ ENDP                ; MyPlugin::MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ??1MyPlugin@@QAE@XZ
_TEXT   SEGMENT
; MyPlugin::~MyPlugin() -- empty destructor. The body is a bare return; the
; `this` pointer in ecx is not used (its symbol is tagged $dead$).
??1MyPlugin@@QAE@XZ PROC                ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx

; 115  :    ~MyPlugin() { }

    ret 0
??1MyPlugin@@QAE@XZ ENDP                ; MyPlugin::~MyPlugin
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?Process@MyPlugin@@QAEXH@Z
_TEXT   SEGMENT
; MyPlugin::Process(int blockSize) -- the scalar Param::Process loop emitted
; inline (source-line notes below refer to lines 26-37 of Param::Process).
; Convention as emitted: `this` in ecx, one 4-byte stack argument, callee pops
; it (ret 4).  Saves and restores ebp, esi, edi.
;
; No call and no voiceIndex loop remain: the row base is a fixed this+2128 and
; the blockSize argument is never read (symbol tagged $dead$); the loop count
; is the constant 256 in edx.
;
; Register roles inside the loop:
;   esi  = this
;   ecx  = pointer to the current pC element (starts at this+2128);
;          [ecx-2048] is the matching pB element
;   edx  = iterations remaining
;   xmm1 = phase accumulator ([this+0])
;   xmm2 = PI upper bound, xmm5 = 0.0 lower bound
;   xmm3 = [this+48] * [this+32] (bp0), xmm4 = [this+64]
;
; Stack temporaries: $T2 = PI, $T3 = 0.0, $T4 = freshly computed value. $T3 and
; $T4 are address-selected by cmov for the lower-bound half of std::clamp.
$T2 = -28                       ; size = 8
$T4 = -20                       ; size = 8
$T3 = -12                       ; size = 8
_blockSize$dead$ = 8                    ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC             ; MyPlugin::Process, COMDAT
; _this$ = ecx

; 117  :    void Process(int blockSize) {

    push    ebp
    mov ebp, esp
    sub esp, 28                 ; 0000001cH

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm2, QWORD PTR __real@400921fb54442d18
    xorps   xmm5, xmm5                  ; xmm5 = 0.0 (lower clamp bound)

; 117  :    void Process(int blockSize) {

    push    esi
    mov esi, ecx                        ; esi = this

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T2[ebp], xmm2

; 117  :    void Process(int blockSize) {

    push    edi

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T3[ebp], xmm5
    mov edx, 256                ; 00000100H
    movsd   xmm3, QWORD PTR [esi+48]

; 27   :        double *pC = c[voiceIndex];

    lea ecx, DWORD PTR [esi+2128]

; 28   :        double phase = mPhase;
; 29   :        double bp0 = mNoteFrequency * mHostPitch;

    movsd   xmm1, QWORD PTR [esi]
    mulsd   xmm3, QWORD PTR [esi+32]
    movsd   xmm4, QWORD PTR [esi+64]
    npad    3
$LL9@Process:

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

; xmm0 = xmm4 * (xmm3 * pB[i] + pC[i])
    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, xmm4
    comisd  xmm0, xmm2                  ; compare value against PI
    movsd   QWORD PTR $T4[ebp], xmm0    ; spill value so its address can be selected below
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; value <= PI (jbe is also taken on an unordered/NaN compare): go check the
; lower bound.  Otherwise the result is PI.
    jbe SHORT $LN15@Process
    movaps  xmm0, xmm2
    jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm5, xmm0                  ; compare 0.0 against value
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; Branch-free lower bound: eax = &0.0 by default, replaced with &value when
; 0.0 <= value; the result is then loaded through the chosen pointer.
    lea eax, DWORD PTR $T3[ebp]
    lea edi, DWORD PTR $T4[ebp]
    cmovbe  eax, edi
    movsd   xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

    add ecx, 8                          ; advance one double

; 32   :            // some other code (that will use phase, like sin(phase))
; 33   : 
; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    addsd   xmm1, xmm0                  ; phase += clamped value
    sub edx, 1
    jne SHORT $LL9@Process

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop edi

; 37   :        mPhase = phase;

    movsd   QWORD PTR [esi], xmm1

; 118  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119  :            mParam1.Process(voiceIndex, blockSize);
; 120  :        }
; 121  :    }

    pop esi
    mov esp, ebp
    pop ebp
    ret 4
?Process@MyPlugin@@QAEXH@Z ENDP             ; MyPlugin::Process
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
_TEXT   SEGMENT
; MyPlugin::ProcessOptimized(int blockSize) -- the SSE2 Param::ProcessOptimized
; loop emitted inline (source-line notes below refer to lines 40-87).
; Convention as emitted: `this` in ecx, one 4-byte stack argument, callee pops
; it (ret 4).  Saves and restores ebx and ebp.
;
; No call and no voiceIndex loop remain: row bases are fixed at this+80 and
; this+2128, and blockSize is never read (symbol tagged $dead$); ecx holds the
; constant 128, i.e. 128 iterations of 2 samples.
;
; Frame: esp is rounded down to a 16-byte boundary and ebp is set from it so
; that _v_phase$31 (ebp-16) can be written with movaps; ebx keeps the entry
; stack pointer for the final unwind.
;
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136 on entry, +16 per iteration; [eax-2048] addresses the
;          other row at the same index
;   ecx  = iterations remaining
;   xmm5 = v_phase accumulator (both lanes start as [this+16])
;   xmm4 / xmm1 = v_pB0 / v_pC0, already scaled
;   xmm3 / xmm2 = v_pB1 / v_pC1, already scaled
;   xmm6 = broadcast of [this+64] * ([this+48] * [this+32])
;   xmm7 = broadcast of [this+64]
;   xmm0 = 0.0 lower bound; the PI upper bound is read from a memory constant
;
; NOTE(review): every iteration pre-loads the NEXT pair ([eax-2040], [eax+8]),
; so the 128th iteration reads elements 256..257 of each row. Confirm at source
; level that these look-ahead loads are in bounds.
_v_phase$31 = -16                   ; size = 16
_blockSize$dead$ = 8                    ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC        ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx

; 122  :    void ProcessOptimized(int blockSize) {

    push    ebx
    mov ebx, esp                        ; ebx = entry stack pointer
    sub esp, 8
    and esp, -16                ; fffffff0H
    add esp, 4
    push    ebp                         ; esp is 16-byte aligned again after this push
    mov ebp, DWORD PTR [ebx+4]          ; fetch the return address ...
    mov DWORD PTR [esp+4], ebp          ; ... and copy it just above the saved ebp
    mov ebp, esp                        ; ebp = aligned frame base
    mov edx, ecx                        ; edx = this
    xorps   xmm3, xmm3
    xorps   xmm2, xmm2
    sub esp, 16                 ; 00000010H

; 40   :        double *pB = b[voiceIndex];

    mov ecx, 128                ; 00000080H
    movsd   xmm6, QWORD PTR [edx+48]
    lea eax, DWORD PTR [edx+2136]
    mulsd   xmm6, QWORD PTR [edx+32]

; 41   :        double *pC = c[voiceIndex];
; 42   :        double phase = mPhaseOptimized;
; 43   :        double bp0 = mNoteFrequency * mHostPitch;
; 44   : 
; 45   :        __m128d v_boundLower = _mm_set1_pd(0.0);
; 46   :        __m128d v_boundUpper = _mm_set1_pd(PI);
; 47   :        __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

    movsd   xmm7, QWORD PTR [edx+64]

; 54   : 
; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

; The source uses _mm_load_pd for v_pB0/v_pC0, but the instruction emitted is movups.
    movsd   xmm0, QWORD PTR [edx+80]
    movsd   xmm5, QWORD PTR [edx+16]
    movups  xmm4, XMMWORD PTR [edx+80]
    movups  xmm1, XMMWORD PTR [edx+2128]
    mulsd   xmm6, xmm7
    unpcklpd xmm3, xmm0                 ; xmm3 = (0.0, pB[0])

; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm0, QWORD PTR [edx+2128]
    unpcklpd xmm7, xmm7                 ; broadcast low lane
    unpcklpd xmm6, xmm6                 ; broadcast low lane
    unpcklpd xmm2, xmm0                 ; xmm2 = (0.0, pC[0])
    xorps   xmm0, xmm0                  ; xmm0 = 0.0 lower bound from here on

; 48   :        __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49   : 
; 50   :        __m128d v_pB0 = _mm_load_pd(pB);
; 51   :        v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);

    mulpd   xmm4, xmm6

; 52   :        __m128d v_pC0 = _mm_load_pd(pC);
; 53   :        v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

    mulpd   xmm1, xmm7

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);

    mulpd   xmm3, xmm6

; 58   :        v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7

; 59   : 
; 60   :        __m128d v_phaseAcc1;
; 61   :        __m128d v_phaseAcc2;
; 62   :        __m128d v_phase = _mm_set1_pd(phase);

    unpcklpd xmm5, xmm5                 ; v_phase = (phase, phase)
    npad    13
$LL9@ProcessOpt:

; 63   : 
; 64   :        for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65   :            // some other code (that will use phase, like sin(phase))
; 66   : 
; 67   :            v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

    addpd   xmm1, xmm4                  ; xmm1 = v_phaseAcc1

; 68   :            v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69   :            v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70   :            v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71   :            v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72   :            v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74   :            v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75   : 
; 76   :            v_pB0 = _mm_load_pd(pB + 2);

; The loads for the next iteration are interleaved with this iteration's clamp
; and accumulate, reusing each register as soon as its old value is consumed.
    movups  xmm4, XMMWORD PTR [eax-2040]
    addpd   xmm2, xmm3                  ; xmm2 = v_phaseAcc2

; 77   :            v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78   :            v_pC0 = _mm_load_pd(pC + 2);
; 79   :            v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80   : 
; 81   :            v_pB1 = _mm_loadu_pd(pB + 1);

    movups  xmm3, XMMWORD PTR [eax-2048]
    maxpd   xmm1, xmm0
    maxpd   xmm2, xmm0
    minpd   xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    minpd   xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
    addpd   xmm5, xmm1
    movups  xmm1, XMMWORD PTR [eax+8]
    addpd   xmm5, xmm2

; 82   :            v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83   :            v_pC1 = _mm_loadu_pd(pC + 1);

    movups  xmm2, XMMWORD PTR [eax]
    add eax, 16                 ; 00000010H
; v_phase is written to its stack slot on every iteration; the only read of
; that slot is the scalar load after the loop.
    movaps  XMMWORD PTR _v_phase$31[ebp], xmm5
    mulpd   xmm4, xmm6
    mulpd   xmm1, xmm7
    mulpd   xmm3, xmm6

; 84   :            v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

    mulpd   xmm2, xmm7
    sub ecx, 1
    jne SHORT $LL9@ProcessOpt

; 85   :        }
; 86   : 
; 87   :        mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

; The index expression is resolved at compile time: element 1 (offset +8) is
; read unconditionally.
    movsd   xmm0, QWORD PTR _v_phase$31[ebp+8]
    movsd   QWORD PTR [edx+16], xmm0

; 123  :        for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124  :            mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125  :        }
; 126  :    }

    mov esp, ebp
    pop ebp
    mov esp, ebx
    pop ebx
    ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP        ; MyPlugin::ProcessOptimized
_TEXT   ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
;   COMDAT _main
_TEXT   SEGMENT
_counterProcessing$1$ = -4304               ; size = 4
_counterProcessing$ = -4304             ; size = 8
_bp0$1$ = -4296                     ; size = 8
_v_radiansPerSample$1$ = -4288              ; size = 16
$T3 = -4264                     ; size = 8
_v_phase$38 = -4256                 ; size = 16
$T4 = -4256                     ; size = 8
$T2 = -4232                     ; size = 8
tv1040 = -4224                      ; size = 16
tv1039 = -4208                      ; size = 16
_myPlugin$ = -4192                  ; size = 4176
__$ArrayPad$ = -4                   ; size = 4
_main   PROC                        ; COMDAT

; 129  : int main() {

    push    ebp
    mov ebp, esp
    and esp, -16                ; fffffff0H
    mov eax, 4312               ; 000010d8H
    call    __chkstk
    mov eax, DWORD PTR ___security_cookie
    xor eax, esp
    mov DWORD PTR __$ArrayPad$[esp+4312], eax

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   xmm0, QWORD PTR __real@4024000000000000

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR _myPlugin$[esp+4392]
    movsd   xmm1, QWORD PTR __real@406fe00000000000
    xorps   xmm2, xmm2

; 16   :    alignas(16) double mNoteFrequency = 10.0;

    movsd   QWORD PTR _myPlugin$[esp+4344], xmm0

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   xmm0, QWORD PTR __real@3ff0000000000000

; 129  : int main() {

    push    esi
    push    edi

; 14   :    alignas(16) double mPhase = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4320], xmm2

; 15   :    alignas(16) double mPhaseOptimized = 0.0;

    movsd   QWORD PTR _myPlugin$[esp+4336], xmm2

; 17   :    alignas(16) double mHostPitch = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4368], xmm0

; 18   :    alignas(16) double mRadiansPerSample = 1.0;

    movsd   QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
    movd    xmm0, eax

; 99   :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea ecx, DWORD PTR [ecx+8]

; 100  :                double value = (sampleIndex / ((double)bufferSize - 1));

    cvtdq2pd xmm0, xmm0
    inc eax
    divsd   xmm0, xmm1

; 101  : 
; 102  :                mParam1.b[voiceIndex][sampleIndex] = value;

    movsd   QWORD PTR [ecx-8], xmm0
    cmp eax, 256                ; 00000100H
    jl  SHORT $LL11@main

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm6, QWORD PTR __real@400921fb54442d18

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    lea edi, DWORD PTR _myPlugin$[esp+6448]
    mov ecx, 512                ; 00000200H

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T2[esp+4320], xmm6

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    xor eax, eax

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   QWORD PTR $T3[esp+4320], xmm2

; 108  :            for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

    rep stosd
    movsd   xmm3, QWORD PTR _myPlugin$[esp+4352]
    xorps   xmm0, xmm0
    mulsd   xmm3, QWORD PTR _myPlugin$[esp+4368]

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movaps  xmm4, xmm2
    movsd   xmm1, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    movsd   xmm5, QWORD PTR _myPlugin$[esp+4336]

; 130  :    MyPlugin myPlugin;
; 131  : 
; 132  :    long long numProcessing = 5;
; 133  :    long long counterProcessing = 0;

    movlpd  QWORD PTR _counterProcessing$[esp+4320], xmm0

; 55   :        __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

    movsd   xmm0, QWORD PTR _myPlugin$[esp+4400]
    movaps  xmm7, xmm3
    mulsd   xmm7, QWORD PTR _myPlugin$[esp+4384]

; 56   :        v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57   :        __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

    mov edi, DWORD PTR _counterProcessing$[esp+4324]
    mov esi, DWORD PTR _counterProcessing$[esp+4320]
    unpcklpd xmm4, xmm0
    movsd   xmm0, QWORD PTR _myPlugin$[esp+6448]
    movups  XMMWORD PTR tv1040[esp+4320], xmm4
    movaps  xmm4, xmm2
    unpcklpd xmm1, xmm1
    unpcklpd xmm4, xmm0
    movups  XMMWORD PTR tv1039[esp+4320], xmm4
    movsd   xmm4, QWORD PTR _myPlugin$[esp+4320]
    movsd   QWORD PTR _bp0$1$[esp+4320], xmm3
    unpcklpd xmm7, xmm7
    movaps  XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
    npad    8
$LL2@main:

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    add esi, 1

; 26   :        double *pB = b[voiceIndex];

    lea ecx, DWORD PTR _myPlugin$[esp+6448]

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    mov DWORD PTR _counterProcessing$1$[esp+4320], esi

; 26   :        double *pB = b[voiceIndex];

    mov edx, 256                ; 00000100H

; 134  : 
; 135  :    // I'll only process once block, just for analysis
; 136  :    while (counterProcessing++ < numProcessing) {

    adc edi, 0
    npad    10
$LL29@main:

; 34   :            phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

    movsd   xmm0, QWORD PTR [ecx-2048]
    mulsd   xmm0, xmm3
    addsd   xmm0, QWORD PTR [ecx]
    mulsd   xmm0, QWORD PTR _myPlugin$[esp+4384]
    comisd  xmm0, xmm6
    movsd   QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    jbe SHORT $LN35@main
    movaps  xmm0, xmm6
    jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287  :        return (static_cast<_Ty1&&>(_Left)

    comisd  xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 :    return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

    lea eax, DWORD PTR $T3[esp+4320]
    lea esi, DWORD PTR $T4[esp+4320]
    cmovbe  eax, esi
    movsd   xmm0, QWORD PTR [eax]

// ...

(注意:由于StackOverflow的篇幅限制,我删除了一些行。)

两者完全不同。另外,我发现VS生成的代码有些冗余:例如搜索字符串phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);,会发现它出现了很多次。

我遗漏了哪些设置?我已经在x86构建上使用了相同的MSVC版本(19.15),优化选项也保持一致。

2 个答案:

答案 0 :(得分:2)

似乎您没有使用相同的编译器标志。Visual Studio的汇编输出显示,每个函数都使用标志/Ogtp进行了优化,这些标志是在您在命令行中指定/Og时内部使用的。另一方面,在Godbolt版本中,您使用的/Ot /O2在内部对应于/Ogtpy。如果我手动添加/Oy标志,生成的代码会略有不同,但仍与Visual Studio生成的代码不一样。

我知道编译器版本并不完全相同,但是19.15.26726.0和19.15.26732.1之间的差异很小,可能只包含错误修复。我认为还有其他标志不同。您可以打开项目的属性页,在“所有选项”和“其他选项”窗格中找到实际使用的全部编译器选项。在发布(Release)版本中,除了/arch:SSE2 /Ot /O2以外还使用了许多其他选项。注意,/arch:SSE2是默认值,因此您不必显式指定它。另外,/O2已经隐含了/Ot。因此/arch:SSE2 /Ot /O2等同于/O2。

答案 1 :(得分:0)

有多个到达目的地的路径。

Roger Orr曾在ACCU会议上做过一个关于编译过程的精彩演讲。例如,一个简单的“hello world”在GCC中会生成98行asm,而在MSVC中会生成6,704行。

要快速且简单地回答您的问题:在Godbolt链接中,版本为19.15.26726.0,而main.asm文件为19.15.26732.1

两个版本很接近,但也许已经足以造成差异?

19.15.26726.0

19.15.26732.1

MSVC尤其特别:使用GCC时,您可以输出asm,然后把这份asm再交给GCC汇编,得到相同的机器代码;而MSVC做不到这一点。因此,即使版本完全相同,您也可能仍然得到不同的asm,这会是一个有趣的实验。this article向您展示了如何在Visual Studio中并行运行两个不同版本的MSVC。