我编写了一个函数,用于将向量与矩阵相乘,并将结果写入输出向量,如下所示:
/*
 * Computes four weighted sums of vectorIn and stores them in vectorOut[0..3].
 *
 * For every i in 0..3:
 *   vectorOut[i] = matrix[i]     * vectorIn[0]
 *                + matrix[i + 1] * vectorIn[1]
 *                + matrix[i + 2] * vectorIn[2]
 *                + matrix[i + 3] * vectorIn[3]
 *
 * Reads matrix[0..6] and vectorIn[0..3]; writes vectorOut[0..3].
 * The whole computation is repeated 1000 times on the same pointers; no
 * per-pass offset is applied to any of them.
 *
 * NOTE(review): consecutive outputs read windows of matrix that start one
 * element apart (i, i + 1, ...). A row-major 4x4 product would index
 * matrix[4 * i + k] instead -- confirm which layout is intended.
 *
 * The GCC-specific optimize attribute requests -O3 and -ftree-vectorize
 * for this function only.
 */
void __attribute__ ((optimize ("-O3", "-ftree-vectorize" )))
myMethod ( double * matrix , double const * vectorIn, double * vectorOut )
{
    enum { PASS_COUNT = 1000, OUTPUT_COUNT = 4 };

    int remaining = PASS_COUNT;
    while ( remaining-- > 0 )
    {
        double const * src = vectorIn;
        double * dst = vectorOut;

        int lane = 0;
        while ( lane < OUTPUT_COUNT )
        {
            /* Window of four coefficients starting at matrix[lane]. */
            double const * coeff = matrix + lane;

            /*
             * The pointers are not restrict-qualified, so every operand is
             * loaded here, inside the loops, and the sum is kept as one
             * left-to-right expression.
             */
            dst[ lane ] = coeff[ 0 ] * src[ 0 ]
                        + coeff[ 1 ] * src[ 1 ]
                        + coeff[ 2 ] * src[ 2 ]
                        + coeff[ 3 ] * src[ 3 ];
            ++lane;
        }
    }
}
尽管我使用了 -O3 和 -ftree-vectorize 进行编译,但它并没有生成 NEON 指令:
0x00000000004006c0 <+0>: mov w3, #0x3e8 // #1000
0x00000000004006c4 <+4>: ldp d6, d7, [x1]
0x00000000004006c8 <+8>: subs w3, w3, #0x1
0x00000000004006cc <+12>: ldp d2, d3, [x0]
0x00000000004006d0 <+16>: ldp d5, d4, [x1,#16]
0x00000000004006d4 <+20>: ldp d1, d0, [x0,#16]
0x00000000004006d8 <+24>: fmul d3, d3, d7
0x00000000004006dc <+28>: fmadd d2, d2, d6, d3
0x00000000004006e0 <+32>: fmadd d1, d1, d5, d2
0x00000000004006e4 <+36>: fmadd d0, d0, d4, d1
0x00000000004006e8 <+40>: str d0, [x2]
0x00000000004006ec <+44>: ldp d6, d7, [x1]
0x00000000004006f0 <+48>: ldp d2, d3, [x0,#8]
0x00000000004006f4 <+52>: ldp d5, d4, [x1,#16]
0x00000000004006f8 <+56>: ldp d1, d0, [x0,#24]
0x00000000004006fc <+60>: fmul d3, d3, d7
0x0000000000400700 <+64>: fmadd d2, d2, d6, d3
0x0000000000400704 <+68>: fmadd d1, d1, d5, d2
0x0000000000400708 <+72>: fmadd d0, d0, d4, d1
0x000000000040070c <+76>: str d0, [x2,#8]
0x0000000000400710 <+80>: ldp d6, d7, [x1]
0x0000000000400714 <+84>: ldp d2, d3, [x0,#16]
0x0000000000400718 <+88>: ldp d1, d0, [x0,#32]
0x000000000040071c <+92>: ldp d5, d4, [x1,#16]
0x0000000000400720 <+96>: fmul d3, d3, d7
0x0000000000400724 <+100>: fmadd d2, d2, d6, d3
0x0000000000400728 <+104>: fmadd d1, d1, d5, d2
0x000000000040072c <+108>: fmadd d0, d0, d4, d1
0x0000000000400730 <+112>: str d0, [x2,#16]
0x0000000000400734 <+116>: ldp d2, d3, [x0,#24]
0x0000000000400738 <+120>: ldp d6, d7, [x1]
0x000000000040073c <+124>: ldr d1, [x0,#40]
0x0000000000400740 <+128>: ldp d5, d4, [x1,#16]
0x0000000000400744 <+132>: ldr d0, [x0,#48]
0x0000000000400748 <+136>: fmul d3, d3, d7
0x000000000040074c <+140>: fmadd d2, d2, d6, d3
0x0000000000400750 <+144>: fmadd d1, d1, d5, d2
0x0000000000400754 <+148>: fmadd d0, d0, d4, d1
0x0000000000400758 <+152>: str d0, [x2,#24]
0x000000000040075c <+156>: b.ne 0x4006c4 <_Z13transformClipPdPKdS_+4>
0x0000000000400760 <+160>: ret
奇怪的是,如果把内层循环移到一个单独的函数中,编译器就会生成带有 NEON 函数调用的优化路径。
所用的编译器是面向 AArch64 的 GCC:aarch64-linux-gnu-g++ (Linaro GCC 6.3-2017.05) 6.3.1 20170404。
有人能给我解释一下这是为什么吗?
谢谢