我正在研究向量运算的自动向量化。作为示例，我对 3 个向量做逐元素的乘加运算（output = v1 + v2 * v3）。我使用 Eigen 的向量类型来保证数据对齐。
#include "pch.h"
#include <iostream>
#include "Core"
#include <chrono>
using Eigen::RowVectorXd;
using std::chrono::high_resolution_clock;
using std::chrono::nanoseconds;
using std::chrono::duration_cast;
//using tbb::tick_count;
/// Micro-benchmark: times a plain index loop that computes
/// output(i) = v1(i) + v2(i) * v3(i) over three Eigen row vectors,
/// so the generated code can be inspected for auto-vectorization.
///
/// Prints the value of EIGEN_DEFAULT_ALIGN_BYTES, then the first result
/// element followed by the elapsed time of the loop.
int main()
{
    // Show the alignment value Eigen was configured with for this build.
    std::cout << EIGEN_DEFAULT_ALIGN_BYTES << '\n';

    const int length = 1000;
    RowVectorXd v1 = RowVectorXd::Constant(length, 4.0);
    RowVectorXd v2 = RowVectorXd::Constant(length, 6.0);
    RowVectorXd v3 = RowVectorXd::Constant(length, 7.0);
    RowVectorXd output(length);

    const auto start = high_resolution_clock::now();
    for (int i = 0; i < length; ++i)
        output(i) = v1(i) + v2(i) * v3(i);
    const auto stop = high_resolution_clock::now();

    // Elapsed time is (stop - start); the original subtracted in the opposite
    // order and therefore always reported a negative duration.
    // Nanoseconds divided by 1000.0 yields microseconds.
    const double elapsed_us = duration_cast<nanoseconds>(stop - start).count() / 1000.0;

    // Printing output(0) also keeps the computed result observable.
    std::cout << output(0) << " Plain loop:" << elapsed_us << '\n';
    return 0;
}
MSVC 17 顺利地将该循环展开并向量化。编译设置：
/permissive- /Yu"pch.h" /GS /GL /analyze- /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"Release\vc141.pdb" /Zc:inline /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oy- /MD /std:c++14 /FC /Fa"Release\" /EHsc /nologo /Fo"Release\" /Fp"Release\test_loops.pch" /diagnostics:classic
反汇编:
output(i) = v1(i) + v2(i) * v3(i);
01121580 movups xmm1,xmmword ptr [edx+esi*8]
01121584 mov edx,dword ptr [ebp-6Ch]
01121587 add esi,8
0112158A movups xmm0,xmmword ptr [edi-10h]
0112158E mulpd xmm1,xmm0
01121592 movups xmm0,xmmword ptr [eax-20h]
01121596 addpd xmm1,xmm0
0112159A movups xmm0,xmmword ptr [edi]
0112159D movups xmmword ptr [ecx-30h],xmm1
011215A1 movups xmm1,xmmword ptr [edx+edi]
011215A5 mov edx,dword ptr [ebp-4Ch]
011215A8 mulpd xmm1,xmm0
011215AC movups xmm0,xmmword ptr [edx+edi]
011215B0 mov edx,dword ptr [ebp-94h]
011215B6 addpd xmm1,xmm0
011215BA movups xmm0,xmmword ptr [edi+10h]
011215BE movups xmmword ptr [edx+edi],xmm1
011215C2 mov edx,dword ptr [ebp-64h]
011215C5 movups xmm1,xmmword ptr [edx+eax]
011215C9 mov edx,dword ptr [ebp-8Ch]
011215CF mulpd xmm1,xmm0
011215D3 movups xmm0,xmmword ptr [eax]
011215D6 addpd xmm1,xmm0
011215DA movups xmm0,xmmword ptr [edi+20h]
011215DE add edi,40h
011215E1 movups xmmword ptr [edx+eax],xmm1
011215E5 mov edx,dword ptr [ebp-34h]
011215E8 movups xmm1,xmmword ptr [edx+ecx]
011215EC mov edx,dword ptr [v3]
011215EF mulpd xmm1,xmm0
011215F3 movups xmm0,xmmword ptr [eax+10h]
011215F7 add eax,40h
011215FA addpd xmm1,xmm0
011215FE movups xmmword ptr [ecx],xmm1
01121601 add ecx,40h
01121604 cmp esi,3E8h
0112160A jl main+3E0h (01121580h)
但是，英特尔编译器 18 Update 4 却没有对该循环进行向量化。编译设置：
/permissive- /Yu"pch.h" /GS /W3 /Gy /Zc:wchar_t /Zi /O2 /Qopt-report:5 /Qopt-report-phase:vec /Fd"Release\vc141.pdb" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /Qipo /Zc:forScope /Gd /MD /std:c++14 /FC /Fa"Release\" /EHsc /nologo /Fo"Release\" /Qprof-dir "Release\" /Fp"Release\test_loops.pch"
反汇编:
for (int i = 0; i < length; ++i)
002113D0 xor edx,edx
002113D2 xor eax,eax
002113D4 nop dword ptr [eax+eax]
002113D9 nop dword ptr [eax]
output(i) = v1(i) + v2(i) * v3(i);
002113E0 mov ecx,dword ptr [v2]
for (int i = 0; i < length; ++i)
002113E3 inc edx
output(i) = v1(i) + v2(i) * v3(i);
002113E4 mov esi,dword ptr [v3]
002113E7 mov edi,dword ptr [v1]
002113EA movsd xmm0,mmword ptr [ecx+eax]
002113EF mulsd xmm0,mmword ptr [esi+eax]
002113F4 mov ecx,dword ptr [output]
002113F7 addsd xmm0,mmword ptr [edi+eax]
002113FC movsd mmword ptr [ecx+eax],xmm0
00211401 mov esi,dword ptr [v2]
00211404 mov edi,dword ptr [v3]
00211407 mov ecx,dword ptr [v1]
0021140A movsd xmm1,mmword ptr [esi+eax+8]
00211410 mulsd xmm1,mmword ptr [edi+eax+8]
00211416 mov esi,dword ptr [output]
00211419 addsd xmm1,mmword ptr [ecx+eax+8]
output(i) = v1(i) + v2(i) * v3(i);
0021141F movsd mmword ptr [esi+eax+8],xmm1
for (int i = 0; i < length; ++i)
00211425 add eax,10h
00211428 cmp edx,1F4h
0021142E jb main+3E0h (02113E0h)
这是怎么回事？为什么 MSVC 能够向量化这个循环，而英特尔编译器却不能？