使用 arm-elf-gcc-4.5 -O3 -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp 编译此程序:
#include <arm_neon.h>
// Cross product of the x/y/z lanes (lanes 0..2) of v1 and v2.
// Lane 3 of the inputs is never read; the result is (x, y, z, z), i.e. lane 3
// of the result duplicates the z component.
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
// Zipping a vector with itself duplicates every lane:
// val[0] = (x, x, y, y), val[1] = (z, z, w, w).
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
// Split into 64-bit halves, each holding one component in both lanes.
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
// vmls_f32(a, b, c) yields a - b * c, so each line below is one component of
// the cross product, present in both lanes of its vector.
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
// vuzp_f32(x, y).val[0] takes lane 0 of x and lane 0 of y -> (x, y);
// z supplies the upper half (z, z).
return vcombine_f32(vuzp_f32(x, y).val[0], z);
}
……这就是我得到的结果。请注意标有 @<<< 的两条无用指令:
_Z5crossRK19__simd128_float32_tS1_:
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #80 @<<<
vswp d17, d16
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80 @<<<
bx lr
永远不会访问堆栈,但堆栈指针会递减,然后递增相同的量。为什么呢?
如果我修改原始代码,在函数序言(prologue)的末尾和函数尾声(epilogue)的开头各加入一条 asm 注释,就像这样:
#include <arm_neon.h>
// Cross product of the x/y/z lanes (lanes 0..2) of v1 and v2, instrumented
// with two asm markers so the function body can be located in the generated
// assembly. Lane 3 of the inputs is never read; the result is (x, y, z, z).
extern float32x4_t cross(const float32x4_t& v1, const float32x4_t& v2) {
// Marker text copied into the assembler output as the first statement of the body.
asm volatile("# End of prologue");
// Zipping a vector with itself duplicates every lane:
// val[0] = (x, x, y, y), val[1] = (z, z, w, w).
float32x4x2_t
xxyyzz1(vzipq_f32(v1, v1)),
xxyyzz2(vzipq_f32(v2, v2));
// Split into 64-bit halves, each holding one component in both lanes.
float32x2_t
xx1(vget_low_f32(xxyyzz1.val[0])),
yy1(vget_high_f32(xxyyzz1.val[0])),
zz1(vget_low_f32(xxyyzz1.val[1])),
xx2(vget_low_f32(xxyyzz2.val[0])),
yy2(vget_high_f32(xxyyzz2.val[0])),
zz2(vget_low_f32(xxyyzz2.val[1]));
// vmls_f32(a, b, c) yields a - b * c, so each line below is one component of
// the cross product, present in both lanes of its vector.
float32x2_t
x(vmls_f32(vmul_f32(yy1, zz2), zz1, yy2)),
y(vmls_f32(vmul_f32(zz1, xx2), xx1, zz2)),
z(vmls_f32(vmul_f32(xx1, yy2), yy1, xx2));
// The result is held in a named local so the second marker can sit between
// the computation and the return.
float32x4_t res(vcombine_f32(vuzp_f32(x, y).val[0], z));
// Marker text copied into the assembler output as the last statement before the return.
asm volatile("# Start of epilogue");
return res;
}
然后我得到的汇编略有不同:
_Z5crossRK19__simd128_float32_tS1_:
sub sp, sp, #80
# End of prologue
vldmia r0, {d16-d17}
vldmia r1, {d22-d23}
vmov q10, q8 @ v4sf
vmov q9, q11 @ v4sf
vzip.32 q8, q10
vzip.32 q11, q9
vmov d24, d17
vmov d21, d22
vmov d22, d23
vmul.f32 d17, d24, d18
vmul.f32 d19, d20, d21
vmls.f32 d19, d16, d18
vmls.f32 d17, d20, d22
vmul.f32 d16, d16, d22
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
vswp d17, d16
# Start of epilogue
vmov r0, r1, d16 @ v4sf
vmov r2, r3, d17
add sp, sp, #80
bx lr
堆栈指针的递减/递增显然是函数序言(prologue)/尾声(epilogue)的一部分,即使没有使用堆栈也会发生。这是为了符合某种标准,还是 gcc 的优化缺陷?
编辑:编译器是 arm-elf-gcc-4.5 (GCC) 4.5.0,配置为:/opt/local/var/macports/build/_opt_local_var_macports_sources_rsync.macports.org_release_ports_cross_arm-elf-gcc/work/gcc-4.5.0/configure --prefix=/opt/local --infodir=/opt/local/share/info --mandir=/opt/local/share/man --target=arm-elf --program-prefix=arm-elf- --program-suffix=-4.5 --without-included-gettext --enable-obsolete --with-newlib --disable-__cxa_atexit --enable-multilib --enable-biendian --disable-libgfortran --with-gxx-include-dir=/opt/local/arm-elf/include/c++/4.5.0/ --enable-languages=c,c++,objc --build=x86_64-apple-darwin10 --enable-fpu
编辑:我用下面的 C 源码定位了问题。它只在使用包含向量类型数组的临时变量时出现,例如 float32x4x2_t(声明为 struct { float32x4_t val[2]; }),即使这些临时变量全部位于寄存器中。我认为这是一个缺陷,所以已经提交了错误报告。
#include <arm_neon.h>
// This one is ok
// Returns the lane-wise sum of the two vectors pointed to by v1 and v2.
// Uses no float32x4x2_t temporaries; serves as the reference case.
extern float32x4_t add(float32x4_t* v1, float32x4_t* v2) {
return vaddq_f32(*v1, *v2);
// The disabled region below is not code: it records the assembly reported for
// this function (no stack-pointer adjustment appears in it).
#if 0
produced assembly:
add:
vldmia r0, {d16-d17}
vldmia r1, {d18-d19}
vadd.f32 q8, q8, q9
vmov r0, r1, d16
vmov r2, r3, d17
bx lr
#endif
}
// This one uses float32x4x2_t temporaries and has the bug
// Cross product of the x/y/z lanes (lanes 0..2) of *v1 and *v2.
// Lane 3 of the inputs is never read; the result is (x, y, z, z).
extern float32x4_t cross(float32x4_t* v1, float32x4_t* v2) {
// Zipping a vector with itself duplicates every lane:
// val[0] = (x, x, y, y), val[1] = (z, z, w, w).
float32x4x2_t
xxyyzz1=vzipq_f32(*v1, *v1),
xxyyzz2=vzipq_f32(*v2, *v2);
// Split into 64-bit halves, each holding one component in both lanes.
float32x2_t
xx1=vget_low_f32(xxyyzz1.val[0]),
yy1=vget_high_f32(xxyyzz1.val[0]),
zz1=vget_low_f32(xxyyzz1.val[1]),
xx2=vget_low_f32(xxyyzz2.val[0]),
yy2=vget_high_f32(xxyyzz2.val[0]),
zz2=vget_low_f32(xxyyzz2.val[1]);
// vmls_f32(a, b, c) yields a - b * c, so each line below is one component of
// the cross product, present in both lanes of its vector.
float32x2_t
x=vmls_f32(vmul_f32(yy1, zz2), zz1, yy2),
y=vmls_f32(vmul_f32(zz1, xx2), xx1, zz2),
z=vmls_f32(vmul_f32(xx1, yy2), yy1, xx2);
// vuzp_f32(x, y).val[0] takes lane 0 of x and lane 0 of y -> (x, y);
// z supplies the upper half (z, z).
return vcombine_f32(vuzp_f32(x, y).val[0], z);
// The disabled region below is not code: it records the assembly reported for
// this function. The two lines tagged "here" / "and here" adjust sp by 48
// although no instruction in the listing accesses the stack.
#if 0
produced assembly:
cross:
vldmia r0, {d18-d19}
vldmia r1, {d16-d17}
vmov q10, q9
vmov q11, q8
vzip.32 q9, q10
vzip.32 q8, q11
vmov d24, d19
vmov d21, d16
vmov d16, d17
vmul.f32 d19, d20, d21
vmul.f32 d17, d24, d22
vmls.f32 d17, d20, d16
vmls.f32 d19, d18, d22
vmul.f32 d16, d18, d16
vuzp.32 d17, d19
vmls.f32 d16, d24, d21
sub sp, sp, #48 @ here
vswp d17, d16
vmov r0, r1, d16
vmov r2, r3, d17
add sp, sp, #48 @ and here
bx lr
#endif
}