英特尔编译器与GCC代码生成的差异

时间:2013-06-01 04:56:17

标签: optimization gcc compiler-construction intel

我正在学习x64编程,同时也在了解英特尔C++编译器与GCC之间的差异,以及它们各自是如何优化指令的。

问题:

  1. 让英特尔编译器输出汇编代码(类似于gcc -S)的最佳方法是什么? 目前我是在Visual Studio中调试并查看反汇编来阅读生成的指令。

  2. 英特尔编译器生成的psum1的反汇编,并没有像GCC汇编输出中那样遵守通过寄存器rdi,rsi,rdx,rcx,r8,r9传递参数的约定。我在这里遗漏了什么?

  3. 出于某种原因,英特尔编译器没有优化内存访问,我需要更改哪些设置?

                //intel compiler /Ox output
                p[i] = p[i-1] + a[i];
                000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
                000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
                000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
                000000013F79119D  addss       xmm0,xmm1
                000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0
    
    //GCC -O3 output
    LBB1_3:
    decq    %rdx
    LBB1_2:
    addq    $4, %rsi
            addq    $4, %rdi
            addss   (%rdi), %xmm0
            movss   %xmm0, (%rsi)
    testq   %rdx, %rdx
            jne LBB1_3
            LBB1_4:
    
  4. 原始C代码

    /*
     * psum1 - running (prefix) sum of a[] written into p[].
     *
     *   p[0] = a[0]
     *   p[i] = p[i-1] + a[i]   for 1 <= i < n
     *
     * a: input array (read at indices 0 .. n-1)
     * p: output array (written at indices 0 .. n-1)
     * n: loop bound; the loop body runs only when n >= 2
     *
     * Note: the p[0] = a[0] assignment is unconditional, so index 0 of both
     * arrays is accessed even when n <= 0.
     */
    void psum1( float a[], float p[], long int n ) {
        long int i;
        p[0] = a[0];
        for (i=1; i<n; i++) {
            /* Each result depends on the one just stored (p[i-1]), so the
               iterations form a serial dependency chain. */
            p[i] = p[i-1] + a[i];
        }
    }
    

    英特尔C++编译器2013在Visual Studio 2010中的反汇编结果:

    • 完全优化 /Ox
    • 启用内部函数 /Oi
    • 速度优先 /Ot

      void psum1(float a [],float p [],long int n){     long int i;

      p[0] = a[0];
      000000013F791156  movss       xmm0,dword ptr [rcx]
      000000013F79115A  mov         dword ptr [rdx],eax
      
      for( i=1; i<n; i++ ) {
          000000013F79115C  jle         psum1+7Ah (13F7911CAh)
          000000013F79115E  mov         eax,1
          000000013F791163  lea         r10d,[r8-1]
          000000013F791167  mov         r11d,r10d
          000000013F79116A  xor         r9d,r9d
          000000013F79116D  shr         r11d,1Fh
          000000013F791171  lea         r8d,[r11+r8-1]
          000000013F791176  sar         r8d,1
          000000013F791179  test        r8d,r8d
          000000013F79117C  jbe         psum1+5Eh (13F7911AEh)
      
          p[i] = p[i-1] + a[i];
          000000013F79117E  lea         eax,[r9+r9]
      
          for( i=1; i<n; i++ ) {
              000000013F791182  inc         r9d
      
              p[i] = p[i-1] + a[i];
              000000013F791185  movsxd      rax,eax
      
              for( i=1; i<n; i++ ) {
                  000000013F791188  cmp         r9d,r8d
      
                  p[i] = p[i-1] + a[i];
                  000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
                  000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
                  000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
                  000000013F79119D  addss       xmm0,xmm1
                  000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0
      
                  for( i=1; i<n; i++ ) {
                      000000013F7911A7  jb          psum1+2Eh (13F79117Eh)
                      000000013F7911A9  lea         eax,[r9+r9+1]
                      000000013F7911AE  lea         r8d,[rax-1]
                      000000013F7911B2  cmp         r10d,r8d
                      000000013F7911B5  jbe         psum1+7Ah (13F7911CAh)
      
                      p[i] = p[i-1] + a[i];
                      000000013F7911B7  movsxd      rax,eax
                      000000013F7911BA  movss       xmm0,dword ptr [rdx+rax*4-4]
                      000000013F7911C0  addss       xmm0,dword ptr [rcx+rax*4]
                      000000013F7911C5  movss       dword ptr [rdx+rax*4],xmm0
                  }
              }
              000000013F7911CA  ret
              000000013F7911CB  nop         dword ptr [rax+rax]
      

    GCC汇编输出,完全优化 -O3

    # ---------------------------------------------------------------------
    # GCC -O3 output for: void psum1(float a[], float p[], long int n)
    # Syntax: AT&T (op src, dst).
    # Register roles, per the C signature shown in this post and the
    # rdi/rsi/rdx argument order the question refers to:
    #   rdi  = a (advanced by 4 bytes per iteration)
    #   rsi  = p (advanced by 4 bytes per iteration)
    #   rdx  = n on entry, then a count of iterations still to run
    #   xmm0 = running sum; holds the value last stored to p[], so p[i-1]
    #          is never reloaded from memory inside the loop
    # ---------------------------------------------------------------------
    .section    __TEXT,__text,regular,pure_instructions
    .globl  _psum1
    .align  4, 0x90
    _psum1:
    Leh_func_begin1:
    # Frame-pointer prologue.
    pushq   %rbp
            Ltmp0:
    movq    %rsp, %rbp
            Ltmp1:
    # p[0] = a[0]; the value stays in xmm0 as the initial running sum.
    movss   (%rdi), %xmm0
            movss   %xmm0, (%rsi)
    # if (n < 2) the loop body never runs: skip straight to the epilogue.
    cmpq    $2, %rdx
            jl  LBB1_4
    # rdx = n - 2 = iterations remaining after the first one.
            addq    $-2, %rdx
    # Enter the loop at LBB1_2, bypassing the decrement on the first pass.
            jmp LBB1_2
    .align  4, 0x90
    LBB1_3:
    # One more iteration consumed.
    decq    %rdx
    LBB1_2:
    # Step both pointers to the next float (4 bytes).
    addq    $4, %rsi
            addq    $4, %rdi
    # xmm0 += a[i]; then p[i] = xmm0.
            addss   (%rdi), %xmm0
            movss   %xmm0, (%rsi)
    # Loop again while iterations remain (rdx != 0).
    testq   %rdx, %rdx
            jne LBB1_3
    LBB1_4:
    # Epilogue: restore the caller's frame pointer and return.
    popq    %rbp
            ret
    Leh_func_end1:
    

0 个答案:

没有答案