在 g++ 4.4.7 中为复数运算生成高效的汇编代码

时间:2017-07-21 14:38:20

标签: c++ gcc optimization compiler-optimization

我有一个非常简单的函数:

// Computes dd[ii] = aa[ii]*uu + bb[ii]*vv + cc[ii] for every ii in [0, nn).
//
// aa, bb, cc: input arrays; dd: output array. All four pointers are
// declared __restrict__, i.e. the caller promises they do not alias.
// uu, vv: scale factors, passed by value.
// nn: number of elements to process.
//
// noinline keeps the function from being inlined into its caller
// (presumably so it shows up as a separate function when disassembled).
__attribute__((noinline))
void benchmark(cfloat* __restrict__ aa, cfloat* __restrict__ bb, cfloat* __restrict__ cc, cfloat* __restrict__ dd, cfloat uu, cfloat vv, size_t nn) {
    // NOTE(review): ii is ssize_t while nn is size_t, so this comparison
    // mixes signed and unsigned operands.
    for (ssize_t ii=0; ii < nn; ii++) {
        dd[ii] = (
            aa[ii]*uu +
            bb[ii]*vv +
            cc[ii]
        );
    }
}

根据我定义 cfloat 对象的方式不同,g++ 4.4.7 会生成差别很大的汇编代码。

第一个版本:如果我这样定义 cfloat:

// Complex number held as two separate float members.
struct cfloat {
    // Initializes re and im from the given values.
    cfloat(float re, float im) : re(re), im(im) {}
    float re,im;  // real part, imaginary part
};

// Component-wise sum: adds the re members and the im members of a and b.
cfloat operator +(cfloat a, cfloat b) {
    return cfloat(a.re+b.re, a.im+b.im);
}

// Complex product of a and b:
//   re = a.re*b.re - a.im*b.im
//   im = a.re*b.im + a.im*b.re
cfloat operator *(cfloat a, cfloat b) {
    return cfloat(a.re*b.re-a.im*b.im, a.re*b.im+a.im*b.re);
}

则会为 benchmark 函数生成如下汇编代码(使用 g++ testcx.cc -O3 -o testcx 编译):

   0x00000000004006a0 <+0>: push   %r15
   0x00000000004006a2 <+2>: test   %r8,%r8
   0x00000000004006a5 <+5>: push   %r14
   0x00000000004006a7 <+7>: push   %r13
   0x00000000004006a9 <+9>: push   %r12
   0x00000000004006ab <+11>:    push   %rbp
   0x00000000004006ac <+12>:    push   %rbx
   0x00000000004006ad <+13>:    movq   %xmm0,-0x28(%rsp)
   0x00000000004006b3 <+19>:    mov    %rdi,-0x38(%rsp)
   0x00000000004006b8 <+24>:    mov    -0x28(%rsp),%rax
   0x00000000004006bd <+29>:    movq   %xmm1,-0x28(%rsp)
   0x00000000004006c3 <+35>:    mov    -0x28(%rsp),%r9
   0x00000000004006c8 <+40>:    je     0x4008a0 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+512>
   0x00000000004006ce <+46>:    mov    %r9,%r15
   0x00000000004006d1 <+49>:    mov    %rax,%r14
   0x00000000004006d4 <+52>:    xor    %r11d,%r11d
   0x00000000004006d7 <+55>:    shr    $0x20,%r15
   0x00000000004006db <+59>:    shr    $0x20,%r14
   0x00000000004006df <+63>:    xor    %r10d,%r10d
   0x00000000004006e2 <+66>:    mov    %r15d,-0x2c(%rsp)
   0x00000000004006e7 <+71>:    xor    %ebp,%ebp
   0x00000000004006e9 <+73>:    xor    %ebx,%ebx
   0x00000000004006eb <+75>:    movss  -0x2c(%rsp),%xmm6
   0x00000000004006f1 <+81>:    mov    %r9d,-0x2c(%rsp)
   0x00000000004006f6 <+86>:    movss  -0x2c(%rsp),%xmm5
   0x00000000004006fc <+92>:    mov    %r14d,-0x2c(%rsp)
   0x0000000000400701 <+97>:    movss  -0x2c(%rsp),%xmm4
   0x0000000000400707 <+103>:   mov    %eax,-0x2c(%rsp)
   0x000000000040070b <+107>:   xor    %r13d,%r13d
   0x000000000040070e <+110>:   xor    %r12d,%r12d
   0x0000000000400711 <+113>:   movabs $0xffffffff00000000,%r9
   0x000000000040071b <+123>:   movss  -0x2c(%rsp),%xmm3
   0x0000000000400721 <+129>:   nopl   0x0(%rax)
   0x0000000000400728 <+136>:   lea    0x0(,%r13,8),%rax
   0x0000000000400730 <+144>:   movaps %xmm6,%xmm1
   0x0000000000400733 <+147>:   movaps %xmm5,%xmm7
   0x0000000000400736 <+150>:   and    $0xffffffff,%ebp
   0x0000000000400739 <+153>:   lea    (%rsi,%rax,1),%r15
   0x000000000040073d <+157>:   lea    (%rdx,%rax,1),%r14
   0x0000000000400741 <+161>:   add    -0x38(%rsp),%rax
   0x0000000000400746 <+166>:   and    $0xffffffff,%ebx
   0x0000000000400749 <+169>:   add    $0x1,%r12
   0x000000000040074d <+173>:   movss  (%r15),%xmm0
   0x0000000000400752 <+178>:   movss  0x4(%r15),%xmm2
   0x0000000000400758 <+184>:   mulss  %xmm0,%xmm1
   0x000000000040075c <+188>:   mulss  %xmm2,%xmm7
   0x0000000000400760 <+192>:   mulss  %xmm5,%xmm0
   0x0000000000400764 <+196>:   mulss  %xmm6,%xmm2
   0x0000000000400768 <+200>:   addss  %xmm7,%xmm1
   0x000000000040076c <+204>:   movaps %xmm3,%xmm7
   0x000000000040076f <+207>:   subss  %xmm2,%xmm0
   0x0000000000400773 <+211>:   movd   %xmm1,-0x30(%rsp)
   0x0000000000400779 <+217>:   mov    -0x30(%rsp),%edi
   0x000000000040077d <+221>:   movaps %xmm4,%xmm1
   0x0000000000400780 <+224>:   movd   %xmm0,-0x30(%rsp)
   0x0000000000400786 <+230>:   mov    %edi,%r15d
   0x0000000000400789 <+233>:   mov    -0x30(%rsp),%edi
   0x000000000040078d <+237>:   movss  (%rax),%xmm0
   0x0000000000400791 <+241>:   shl    $0x20,%r15
   0x0000000000400795 <+245>:   movss  0x4(%rax),%xmm2
   0x000000000040079a <+250>:   mulss  %xmm0,%xmm1
   0x000000000040079e <+254>:   or     %r15,%rbp
   0x00000000004007a1 <+257>:   mulss  %xmm2,%xmm7
   0x00000000004007a5 <+261>:   mov    %edi,%r15d
   0x00000000004007a8 <+264>:   and    %r9,%rbp
   0x00000000004007ab <+267>:   mulss  %xmm3,%xmm0
   0x00000000004007af <+271>:   or     %r15,%rbp
   0x00000000004007b2 <+274>:   mulss  %xmm4,%xmm2
   0x00000000004007b6 <+278>:   addss  %xmm7,%xmm1
   0x00000000004007ba <+282>:   subss  %xmm2,%xmm0
   0x00000000004007be <+286>:   movd   %xmm1,-0x30(%rsp)
   0x00000000004007c4 <+292>:   mov    -0x30(%rsp),%edi
   0x00000000004007c8 <+296>:   movd   %xmm0,-0x30(%rsp)
   0x00000000004007ce <+302>:   mov    %edi,%eax
   0x00000000004007d0 <+304>:   mov    -0x30(%rsp),%edi
   0x00000000004007d4 <+308>:   shl    $0x20,%rax
   0x00000000004007d8 <+312>:   or     %rax,%rbx
   0x00000000004007db <+315>:   and    %r9,%rbx
   0x00000000004007de <+318>:   mov    %edi,%eax
   0x00000000004007e0 <+320>:   or     %rax,%rbx
   0x00000000004007e3 <+323>:   mov    %r10,%rax
   0x00000000004007e6 <+326>:   mov    %rbx,%rdi
   0x00000000004007e9 <+329>:   and    $0xffffffff,%eax
   0x00000000004007ec <+332>:   shr    $0x20,%rdi
   0x00000000004007f0 <+336>:   mov    %edi,-0x20(%rsp)
   0x00000000004007f4 <+340>:   mov    %rbp,%rdi
   0x00000000004007f7 <+343>:   shr    $0x20,%rdi
   0x00000000004007fb <+347>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400801 <+353>:   mov    %edi,-0x10(%rsp)
   0x0000000000400805 <+357>:   addss  -0x10(%rsp),%xmm0
   0x000000000040080b <+363>:   mov    %ebp,-0x10(%rsp)
   0x000000000040080f <+367>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400815 <+373>:   mov    -0x20(%rsp),%r10d
   0x000000000040081a <+378>:   mov    %ebx,-0x20(%rsp)
   0x000000000040081e <+382>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400824 <+388>:   addss  -0x10(%rsp),%xmm0
   0x000000000040082a <+394>:   shl    $0x20,%r10
   0x000000000040082e <+398>:   or     %rax,%r10
   0x0000000000400831 <+401>:   and    %r9,%r10
   0x0000000000400834 <+404>:   movss  %xmm0,-0x20(%rsp)
   0x000000000040083a <+410>:   mov    -0x20(%rsp),%eax
   0x000000000040083e <+414>:   or     %rax,%r10
   0x0000000000400841 <+417>:   mov    %r11,%rax
   0x0000000000400844 <+420>:   mov    %r10,%rdi
   0x0000000000400847 <+423>:   and    $0xffffffff,%eax
   0x000000000040084a <+426>:   shr    $0x20,%rdi
   0x000000000040084e <+430>:   mov    %edi,-0x20(%rsp)
   0x0000000000400852 <+434>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400858 <+440>:   addss  0x4(%r14),%xmm0
   0x000000000040085e <+446>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400864 <+452>:   mov    -0x20(%rsp),%r11d
   0x0000000000400869 <+457>:   mov    %r10d,-0x20(%rsp)
   0x000000000040086e <+462>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400874 <+468>:   addss  (%r14),%xmm0
   0x0000000000400879 <+473>:   shl    $0x20,%r11
   0x000000000040087d <+477>:   or     %rax,%r11
   0x0000000000400880 <+480>:   and    %r9,%r11
   0x0000000000400883 <+483>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400889 <+489>:   mov    -0x20(%rsp),%eax
   0x000000000040088d <+493>:   or     %rax,%r11
   0x0000000000400890 <+496>:   cmp    %r8,%r12
   0x0000000000400893 <+499>:   mov    %r11,(%rcx,%r13,8)
   0x0000000000400897 <+503>:   mov    %r12,%r13
   0x000000000040089a <+506>:   jne    0x400728 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+136>
   0x00000000004008a0 <+512>:   pop    %rbx
   0x00000000004008a1 <+513>:   pop    %rbp
   0x00000000004008a2 <+514>:   pop    %r12
   0x00000000004008a4 <+516>:   pop    %r13
   0x00000000004008a6 <+518>:   pop    %r14
   0x00000000004008a8 <+520>:   pop    %r15
   0x00000000004008aa <+522>:   retq 

这大约是 133 条指令。

如果我改为这样定义 cfloat,用数组来保存状态:

// Complex number held as a two-element float array.
struct cfloat {
    // Stores re in ri[0] and im in ri[1].
    cfloat(float re, float im) { ri[0] = re; ri[1] = im; }
    float ri[2];  // ri[0] = real part, ri[1] = imaginary part
};

// Component-wise sum: adds ri[0] to ri[0] and ri[1] to ri[1].
cfloat operator +(cfloat a, cfloat b) {
    return cfloat(a.ri[0]+b.ri[0], a.ri[1]+b.ri[1]);
}

// Complex product of a and b, with ri[0] as the real part and ri[1] as the
// imaginary part:
//   real = a.ri[0]*b.ri[0] - a.ri[1]*b.ri[1]
//   imag = a.ri[0]*b.ri[1] + a.ri[1]*b.ri[0]
cfloat operator *(cfloat a, cfloat b) {
    return cfloat(a.ri[0]*b.ri[0]-a.ri[1]*b.ri[1], a.ri[0]*b.ri[1]+a.ri[1]*b.ri[0]);
}

则会生成如下汇编代码:

Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
   0x00000000004006a0 <+0>: push   %rbx
   0x00000000004006a1 <+1>: movq   %xmm0,-0x8(%rsp)
   0x00000000004006a7 <+7>: mov    -0x8(%rsp),%r9
   0x00000000004006ac <+12>:    movq   %xmm1,-0x8(%rsp)
   0x00000000004006b2 <+18>:    mov    -0x8(%rsp),%rax
   0x00000000004006b7 <+23>:    mov    %r9d,-0xc(%rsp)
   0x00000000004006bc <+28>:    shr    $0x20,%r9
   0x00000000004006c0 <+32>:    movss  -0xc(%rsp),%xmm9
   0x00000000004006c7 <+39>:    mov    %r9d,-0xc(%rsp)
   0x00000000004006cc <+44>:    movss  -0xc(%rsp),%xmm8
   0x00000000004006d3 <+51>:    mov    %eax,-0xc(%rsp)
   0x00000000004006d7 <+55>:    shr    $0x20,%rax
   0x00000000004006db <+59>:    movss  -0xc(%rsp),%xmm7
   0x00000000004006e1 <+65>:    test   %r8,%r8
   0x00000000004006e4 <+68>:    mov    %eax,-0xc(%rsp)
   0x00000000004006e8 <+72>:    movss  -0xc(%rsp),%xmm6
   0x00000000004006ee <+78>:    je     0x400796 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+246>
   0x00000000004006f4 <+84>:    xor    %eax,%eax
   0x00000000004006f6 <+86>:    xor    %r9d,%r9d
   0x00000000004006f9 <+89>:    nopl   0x0(%rax)
   0x0000000000400700 <+96>:    shl    $0x3,%rax
   0x0000000000400704 <+100>:   movaps %xmm7,%xmm0
   0x0000000000400707 <+103>:   lea    (%rsi,%rax,1),%rbx
   0x000000000040070b <+107>:   movaps %xmm6,%xmm3
   0x000000000040070e <+110>:   lea    (%rcx,%rax,1),%r10
   0x0000000000400712 <+114>:   lea    (%rdx,%rax,1),%r11
   0x0000000000400716 <+118>:   lea    (%rdi,%rax,1),%rax
   0x000000000040071a <+122>:   movss  (%rbx),%xmm1
   0x000000000040071e <+126>:   add    $0x1,%r9
   0x0000000000400722 <+130>:   movss  0x4(%rbx),%xmm5
   0x0000000000400727 <+135>:   mulss  %xmm1,%xmm0
   0x000000000040072b <+139>:   mulss  %xmm5,%xmm3
   0x000000000040072f <+143>:   movss  (%rax),%xmm2
   0x0000000000400733 <+147>:   movaps %xmm8,%xmm10
   0x0000000000400737 <+151>:   mulss  %xmm6,%xmm1
   0x000000000040073b <+155>:   movss  0x4(%rax),%xmm4
   0x0000000000400740 <+160>:   mulss  %xmm7,%xmm5
   0x0000000000400744 <+164>:   mulss  %xmm4,%xmm10
   0x0000000000400749 <+169>:   cmp    %r8,%r9
   0x000000000040074c <+172>:   mov    %r9,%rax
   0x000000000040074f <+175>:   subss  %xmm3,%xmm0
   0x0000000000400753 <+179>:   movaps %xmm2,%xmm3
   0x0000000000400756 <+182>:   mulss  %xmm9,%xmm4
   0x000000000040075b <+187>:   mulss  %xmm9,%xmm3
   0x0000000000400760 <+192>:   addss  %xmm5,%xmm1
   0x0000000000400764 <+196>:   mulss  %xmm8,%xmm2
   0x0000000000400769 <+201>:   subss  %xmm10,%xmm3
   0x000000000040076e <+206>:   addss  %xmm4,%xmm2
   0x0000000000400772 <+210>:   addss  %xmm3,%xmm0
   0x0000000000400776 <+214>:   addss  %xmm2,%xmm1
   0x000000000040077a <+218>:   addss  (%r11),%xmm0
   0x000000000040077f <+223>:   addss  0x4(%r11),%xmm1
   0x0000000000400785 <+229>:   movss  %xmm0,(%r10)
   0x000000000040078a <+234>:   movss  %xmm1,0x4(%r10)
   0x0000000000400790 <+240>:   jne    0x400700 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+96>
   0x0000000000400796 <+246>:   pop    %rbx
   0x0000000000400797 <+247>:   retq   
End of assembler dump.

这大约是 59 条指令。而且,我的基准测试显示,第一个版本比第二个版本慢约 3 倍。

我更喜欢把实部和虚部作为单独的字段,尤其是因为把它们放在数组中似乎会在某种程度上妨碍英特尔编译器的向量化器。

有什么办法可以让 gcc 相信这两个类是等价的吗?

2 个答案:

答案 0 :(得分:1)

我自己都不太敢相信,但如果我显式定义一个拷贝构造函数,问题就自行解决了:

// Complex number held as two separate float members, with a user-declared
// copy constructor.
struct cfloat {
    // Initializes re and im from the given values.
    cfloat(float re, float im) : re(re),   im(im)   {}
    // Member-wise copy, written out explicitly rather than left to the
    // implicitly generated copy constructor.
    cfloat(const cfloat& o)    : re(o.re), im(o.im) {}

    float re,im;  // real part, imaginary part
};

现在会生成相同的汇编代码:

Dump of assembler code for function benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long):
   0x0000000000400600 <+0>: mov    0x8(%rsp),%r10
   0x0000000000400605 <+5>: test   %r10,%r10
   0x0000000000400608 <+8>: je     0x4006aa <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+170>
   0x000000000040060e <+14>:    xor    %eax,%eax
   0x0000000000400610 <+16>:    movss  (%r9),%xmm8
   0x0000000000400615 <+21>:    movss  0x4(%r9),%xmm9
   0x000000000040061b <+27>:    movaps %xmm8,%xmm0
   0x000000000040061f <+31>:    movaps %xmm9,%xmm3
   0x0000000000400623 <+35>:    movss  (%rsi,%rax,8),%xmm1
   0x0000000000400628 <+40>:    movss  0x4(%rsi,%rax,8),%xmm7
   0x000000000040062e <+46>:    mulss  %xmm1,%xmm0
   0x0000000000400632 <+50>:    mulss  %xmm7,%xmm3
   0x0000000000400636 <+54>:    movss  (%r8),%xmm5
   0x000000000040063b <+59>:    movss  0x4(%r8),%xmm6
   0x0000000000400641 <+65>:    mulss  %xmm9,%xmm1
   0x0000000000400646 <+70>:    movaps %xmm6,%xmm10
   0x000000000040064a <+74>:    mulss  %xmm8,%xmm7
   0x000000000040064f <+79>:    movss  (%rdi,%rax,8),%xmm2
   0x0000000000400654 <+84>:    subss  %xmm3,%xmm0
   0x0000000000400658 <+88>:    movaps %xmm5,%xmm3
   0x000000000040065b <+91>:    movss  0x4(%rdi,%rax,8),%xmm4
   0x0000000000400661 <+97>:    mulss  %xmm2,%xmm3
   0x0000000000400665 <+101>:   addss  %xmm7,%xmm1
   0x0000000000400669 <+105>:   mulss  %xmm4,%xmm10
   0x000000000040066e <+110>:   mulss  %xmm6,%xmm2
   0x0000000000400672 <+114>:   mulss  %xmm5,%xmm4
   0x0000000000400676 <+118>:   subss  %xmm10,%xmm3
   0x000000000040067b <+123>:   addss  %xmm4,%xmm2
   0x000000000040067f <+127>:   addss  %xmm3,%xmm0
   0x0000000000400683 <+131>:   addss  %xmm2,%xmm1
   0x0000000000400687 <+135>:   addss  (%rdx,%rax,8),%xmm0
   0x000000000040068c <+140>:   addss  0x4(%rdx,%rax,8),%xmm1
   0x0000000000400692 <+146>:   movss  %xmm0,(%rcx,%rax,8)
   0x0000000000400697 <+151>:   movss  %xmm1,0x4(%rcx,%rax,8)
   0x000000000040069d <+157>:   add    $0x1,%rax
   0x00000000004006a1 <+161>:   cmp    %rax,%r10
   0x00000000004006a4 <+164>:   ja     0x400610 <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+16>
   0x00000000004006aa <+170>:   repz retq 
End of assembler dump.

去规范里找找这条规定吧。

答案 1 :(得分:0)

您提到您的目标平台是红帽企业 Linux(Red Hat Enterprise Linux),并且(在您已删除的帖子中提到)较新版本的编译器能生成更好的代码。您可以使用 Developer Toolset 获取更新的编译器,用它构建的应用程序仍与操作系统的其余部分兼容: