我正在编译此代码:
#include <cstdint>
/// Fixed-size bundle of four values of type T, held in a plain array member
/// so that the type remains a simple aggregate.
template <class T>
struct vec {
    T v[4];
};

/// Element-wise multiply-add over four lanes.
///
/// @param x addend
/// @param y first factor
/// @param z second factor
/// @return a vec whose lane i holds x.v[i] + y.v[i] * z.v[i]
template <class T>
vec<T> foo(vec<T> x, vec<T> y, vec<T> z) {
    // One lane of the result; kept as a single expression so the arithmetic
    // (and its result type) is exactly that of the hand-unrolled form.
    const auto lane = [&](int i) { return x.v[i] + y.v[i] * z.v[i]; };
    return {lane(0), lane(1), lane(2), lane(3)};
}

// Explicit instantiations: force code to be emitted for both element types.
template vec<int64_t> foo<int64_t>(vec<int64_t>, vec<int64_t>, vec<int64_t>);
template vec<float> foo<float>(vec<float>, vec<float>, vec<float>);
使用 clang 6.0 和 gcc 7.3,并开启最高级别的优化。但是结果很奇怪:
下面是 clang 在 -O2 及更高优化级别下的输出。这真的更快吗?
clang 6.0:
# clang 6.0 output for the foo<long> instantiation.
# Purely scalar: each of the four result elements is produced by one 64-bit
# imul plus one add, then stored through the pointer held in rdi. That same
# pointer is handed back in rax before returning.
# NOTE(review): the offsets suggest the three arguments sit on the stack in
# declaration order (x at rsp+8, y at rsp+40, z at rsp+72) — confirm against the ABI.
vec<long> foo<long>(vec<long>, vec<long>, vec<long>): # @vec<long> foo<long>(vec<long>, vec<long>, vec<long>)
# element 0 -> [rdi]
mov rax, qword ptr [rsp + 72]
imul rax, qword ptr [rsp + 40]
add rax, qword ptr [rsp + 8]
mov qword ptr [rdi], rax
# element 1 -> [rdi + 8]
mov rax, qword ptr [rsp + 80]
imul rax, qword ptr [rsp + 48]
add rax, qword ptr [rsp + 16]
mov qword ptr [rdi + 8], rax
# element 2 -> [rdi + 16]
mov rax, qword ptr [rsp + 88]
imul rax, qword ptr [rsp + 56]
add rax, qword ptr [rsp + 24]
mov qword ptr [rdi + 16], rax
# element 3 -> [rdi + 24]
mov rax, qword ptr [rsp + 96]
imul rax, qword ptr [rsp + 64]
add rax, qword ptr [rsp + 32]
mov qword ptr [rdi + 24], rax
# return the destination pointer
mov rax, rdi
ret
# clang 6.0 output for the foo<float> instantiation.
# Two packed multiply/add pairs, entirely in registers; no FMA instruction is used.
# NOTE(review): presumably each vec<float> arrives split across two xmm registers
# (x in xmm0:xmm1, y in xmm2:xmm3, z in xmm4:xmm5) — confirm against the ABI.
vec<float> foo<float>(vec<float>, vec<float>, vec<float>): # @vec<float> foo<float>(vec<float>, vec<float>, vec<float>)
# first half: xmm0 += xmm2 * xmm4
mulps xmm2, xmm4
addps xmm0, xmm2
# second half: xmm1 += xmm3 * xmm5
mulps xmm3, xmm5
addps xmm1, xmm3
ret
GCC 7.3:
# GCC 7.3 output for the foo<long> instantiation.
# Vectorized two 64-bit elements at a time. The 64-bit multiply is assembled
# from 32x32->64 pmuludq products:
#   a*b (low 64 bits) = lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32)
# Elements 2..3 and elements 0..1 are computed by two interleaved copies of
# that sequence; results are stored to [rdi+16] and [rdi], and rdi is returned in rax.
# NOTE(review): the offsets suggest x at rsp+8, y at rsp+40, z at rsp+72 — confirm against the ABI.
vec<long> foo<long>(vec<long>, vec<long>, vec<long>):
# load the two factors for elements 2..3
movdqu xmm3, XMMWORD PTR [rsp+56]
mov rax, rdi
movdqu xmm4, XMMWORD PTR [rsp+88]
movdqa xmm1, xmm3
movdqa xmm0, xmm3
movdqa xmm2, xmm4
# first factor for elements 0..1 (used further down)
movdqu xmm5, XMMWORD PTR [rsp+72]
# xmm1 = lo * lo
pmuludq xmm1, xmm4
# shift the high 32-bit halves down, then form the two cross products
psrlq xmm0, 32
psrlq xmm2, 32
pmuludq xmm0, xmm4
pmuludq xmm2, xmm3
# second factor for elements 0..1
movdqu xmm4, XMMWORD PTR [rsp+40]
# sum of cross products, moved to the upper half, plus lo * lo
paddq xmm0, xmm2
psllq xmm0, 32
paddq xmm0, xmm1
movdqa xmm3, xmm5
# addend for elements 2..3
movdqu xmm1, XMMWORD PTR [rsp+24]
movdqa xmm2, xmm4
psrlq xmm3, 32
pmuludq xmm3, xmm4
# elements 2..3: addend + product
paddq xmm1, xmm0
# addend for elements 0..1
movdqu xmm6, XMMWORD PTR [rsp+8]
pmuludq xmm2, xmm5
movdqa xmm0, xmm4
# store elements 2..3
movups XMMWORD PTR [rdi+16], xmm1
# finish the same multiply sequence for elements 0..1
psrlq xmm0, 32
pmuludq xmm0, xmm5
paddq xmm0, xmm3
psllq xmm0, 32
paddq xmm0, xmm2
# elements 0..1: product + addend, then store
paddq xmm0, xmm6
movups XMMWORD PTR [rdi], xmm0
ret
# GCC 7.3 output for the foo<float> instantiation.
# The six incoming xmm registers are first written to memory below rsp so that
# each argument becomes one contiguous 16-byte vector; a single mulps/addps pair
# then does the arithmetic, and the result is split back into xmm0/xmm1 via memory.
vec<float> foo<float>(vec<float>, vec<float>, vec<float>):
# spill the 8-byte halves: [rsp-56] <- xmm0:xmm1, [rsp-40] <- xmm2:xmm3, [rsp-24] <- xmm4:xmm5
movq QWORD PTR [rsp-40], xmm2
movq QWORD PTR [rsp-32], xmm3
movq QWORD PTR [rsp-56], xmm0
movq QWORD PTR [rsp-24], xmm4
movq QWORD PTR [rsp-16], xmm5
movq QWORD PTR [rsp-48], xmm1
# xmm0 = [rsp-40] * [rsp-24] + [rsp-56], four floats at once
movaps xmm0, XMMWORD PTR [rsp-40]
mulps xmm0, XMMWORD PTR [rsp-24]
addps xmm0, XMMWORD PTR [rsp-56]
# write the 16-byte result back, then reload it as two 8-byte halves
movaps XMMWORD PTR [rsp-56], xmm0
mov rax, QWORD PTR [rsp-48]
# low half -> xmm0
movq xmm0, QWORD PTR [rsp-56]
# high half -> xmm1 (bounced through rax and memory)
mov QWORD PTR [rsp-56], rax
movq xmm1, QWORD PTR [rsp-56]
ret
答案 0 :(得分:4)
首先,您需要启用 FMA 硬件支持,例如使用 -mfma。
然后,对于 Clang,您需要通过 -ffp-contract=fast 告诉它进行浮点收缩(contraction)
(GCC 和 ICC 默认会这样做),或者添加 #pragma STDC FP_CONTRACT ON
(参见 https://stackoverflow.com/a/34461738/2542702)。这样 Clang 会生成:
# Two fused multiply-adds replace the separate mulps/addps pairs.
# 213 operand order: dest = src1 * dest + src2.
# xmm2 = xmm4 * xmm2 + xmm0
vfmadd213ps xmm2, xmm4, xmm0
# xmm3 = xmm5 * xmm3 + xmm1
vfmadd213ps xmm3, xmm5, xmm1
# copy the two results into xmm0 and xmm1
vmovaps xmm0, xmm2
vmovaps xmm1, xmm3
对于 GCC,使用向量扩展(vector extensions)可以获得最佳结果:
// Four packed floats, declared with the GCC/Clang `vector_size` attribute.
// The attribute argument is a size in bytes: 4 * sizeof(float).
typedef float float4 __attribute__((vector_size(4 * sizeof(float))));

// Lane-wise multiply-add on whole vectors: returns x + y * z.
// Note that the parameter order is (z, y, x).
float4 foof(float4 z, float4 y, float4 x)
{
    float4 acc = x;
    acc += y * z;  // one compound expression: acc = acc + y * z
    return acc;
}
GCC 和 Clang 都只生成:
# A single fused multiply-add. 132 operand order: dest = dest * src2 + src1,
# i.e. xmm0 = xmm0 * xmm1 + xmm2.
vfmadd132ps xmm0, xmm2, xmm1
根据我的经验,Clang 在对数组和循环进行向量化方面似乎比 GCC 做得更好,但 GCC 的向量扩展在 GCC 中得到的支持是最好的。