我正在研究所使用的技巧How to multiply two quaternions with minimal instructions?,并对我的gcc实施的劣势感到沮丧:
template <typename T> struct quat;
template <> struct quat<float> { float __attribute__((vector_size(16))) data_; };
inline quat<float> operator*(quat<float> const& l, quat<float> const& r) noexcept
{
// l(0)r(3) + l(1)r(2) - l(2)r(1) + l(3)r(0)
// -l(0)r(2) + l(1)r(3) + l(2)r(0) + l(3)r(1)
// l(0)r(1) - l(1)r(0) + l(2)r(3) + l(3)r(2)
// -l(0)r(0) - l(1)r(1) - l(2)r(2) + l(3)r(3)
using int_value_type = ::std::int32_t;
using int_vector_type = ::std::int32_t __attribute__((vector_size(16)));
auto const t1(
l.data_ *
__builtin_shuffle(r.data_, int_vector_type{3, 3, 3, 3})
);
auto const t2(
__builtin_shuffle(l.data_, int_vector_type{1, 2, 0, 2}) *
__builtin_shuffle(r.data_, int_vector_type{2, 0, 1, 2})
);
auto const t3(
__builtin_shuffle(l.data_, int_vector_type{3, 3, 3, 1}) *
__builtin_shuffle(r.data_, int_vector_type{0, 1, 2, 1})
);
auto const t4(
__builtin_shuffle(l.data_, int_vector_type{2, 0, 1, 0}) *
__builtin_shuffle(r.data_, int_vector_type{1, 2, 0, 0})
);
constexpr int_vector_type const mask{
0, 0, 0, 1 << 8 * sizeof(int_value_type) - 1
};
return {
t1 +
decltype(t2)(int_vector_type(t2) ^ mask) +
decltype(t3)(int_vector_type(t3) ^ mask) -
t4
};
}
这使得4个MUL,2个ADD,1个SUB和2个XOR。在gcc向量扩展或数学中是否有一些技巧,我错过了,这可能会使我的实现更接近更优化的那个?
编辑:使用-O3 -ffast-math
Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
0x0000000000400980 <+0>: movaps (%rsi),%xmm2
0x0000000000400983 <+3>: movaps (%rdi),%xmm1
0x0000000000400986 <+6>: movaps %xmm2,%xmm0
0x0000000000400989 <+9>: movaps %xmm2,%xmm4
0x000000000040098c <+12>: movaps %xmm1,%xmm3
0x000000000040098f <+15>: shufps $0xff,%xmm2,%xmm0
0x0000000000400993 <+19>: shufps $0x9,%xmm2,%xmm4
0x0000000000400997 <+23>: shufps $0x12,%xmm1,%xmm3
0x000000000040099b <+27>: mulps %xmm1,%xmm0
0x000000000040099e <+30>: mulps %xmm4,%xmm3
0x00000000004009a1 <+33>: movaps %xmm0,%xmm4
0x00000000004009a4 <+36>: movaps %xmm1,%xmm0
0x00000000004009a7 <+39>: subps %xmm3,%xmm4
0x00000000004009aa <+42>: movaps %xmm2,%xmm3
0x00000000004009ad <+45>: shufps $0x7f,%xmm1,%xmm0
0x00000000004009b1 <+49>: shufps $0x64,%xmm2,%xmm3
0x00000000004009b5 <+53>: shufps $0x89,%xmm1,%xmm1
0x00000000004009b9 <+57>: shufps $0x92,%xmm2,%xmm2
0x00000000004009bd <+61>: mulps %xmm3,%xmm0
0x00000000004009c0 <+64>: movaps 0xa9(%rip),%xmm3 # 0x400a70
0x00000000004009c7 <+71>: mulps %xmm2,%xmm1
0x00000000004009ca <+74>: xorps %xmm3,%xmm0
0x00000000004009cd <+77>: xorps %xmm3,%xmm1
0x00000000004009d0 <+80>: addps %xmm0,%xmm1
0x00000000004009d3 <+83>: addps %xmm1,%xmm4
0x00000000004009d6 <+86>: movaps %xmm4,%xmm0
0x00000000004009d9 <+89>: retq
EDIT2:根据Marc的建议和铿锵声,代码变为:
Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
0x0000000000400ae0 <+0>: movaps (%rdi),%xmm1
0x0000000000400ae3 <+3>: movdqa (%rsi),%xmm2
0x0000000000400ae7 <+7>: pshufd $0xff,%xmm2,%xmm3
0x0000000000400aec <+12>: mulps %xmm1,%xmm3
0x0000000000400aef <+15>: pshufd $0x89,%xmm1,%xmm0
0x0000000000400af4 <+20>: pshufd $0x92,%xmm2,%xmm4
0x0000000000400af9 <+25>: mulps %xmm0,%xmm4
0x0000000000400afc <+28>: pshufd $0x7f,%xmm1,%xmm5
0x0000000000400b01 <+33>: pshufd $0x64,%xmm2,%xmm0
0x0000000000400b06 <+38>: mulps %xmm5,%xmm0
0x0000000000400b09 <+41>: pshufd $0x12,%xmm1,%xmm1
0x0000000000400b0e <+46>: pshufd $0x9,%xmm2,%xmm2
0x0000000000400b13 <+51>: mulps %xmm1,%xmm2
0x0000000000400b16 <+54>: addps %xmm4,%xmm0
0x0000000000400b19 <+57>: xorps 0xc0(%rip),%xmm0 # 0x400be0
0x0000000000400b20 <+64>: addps %xmm3,%xmm0
0x0000000000400b23 <+67>: subps %xmm2,%xmm0
0x0000000000400b26 <+70>: retq
这还不错。
答案 0 :(得分:1)
你可以使用与in the question you refer to基本相同的代码(说实话,我还没有测试过,但我相信它应该有效):
#include <pmmintrin.h>
inline quat<float> operator*(quat<float> const& xyzw, quat<float> const& abcd) noexcept
{
/* The product of two quaternions is: */
/* (X,Y,Z,W) = (xd+yc-zb+wa, -xc+yd+za+wb, xb-ya+zd+wc, -xa-yb-zc+wd) */
__m128 wzyx = _mm_shuffle_ps(xyzw.data_, xyzw.data_, _MM_SHUFFLE(0,1,2,3));
__m128 baba = _mm_shuffle_ps(abcd.data_, abcd.data_, _MM_SHUFFLE(0,1,0,1));
__m128 dcdc = _mm_shuffle_ps(abcd.data_, abcd.data_, _MM_SHUFFLE(2,3,2,3));
/* variable names below are for parts of componens of result (X,Y,Z,W) */
/* nX stands for -X and similarly for the other components */
/* znxwy = (xb - ya, zb - wa, wd - zc, yd - xc) */
__m128 ZnXWY = _mm_hsub_ps(_mm_mul_ps(xyzw.data_, baba), _mm_mul_ps(wzyx, dcdc));
/* xzynw = (xd + yc, zd + wc, wb + za, yb + xa) */
__m128 XZYnW = _mm_hadd_ps(_mm_mul_ps(xyzw.data_, dcdc), _mm_mul_ps(wzyx, baba));
__m128 XZWY = _mm_addsub_ps(_mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)),
_mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)));
/* now we only need to shuffle the components in place and return the result */
return quat<float>{_mm_shuffle_ps(XZWY, XZWY, _MM_SHUFFLE(2,1,3,0))};
/* operations: 6 shuffles, 4 multiplications, 3 compound additions/subtractions */
}