我正在将 Eigen 3.3.5 与 MSVC(Visual C++,VS2013)一起使用。
我已经阅读了很多有关 MSVC 及其他编译器下 Eigen 与向量化的文章。
与编译器相关的选项:
我承认我是 MSVC 编译器家族的新手,它对模板代码的处理效率可能不如 gcc 或 clang。
// Minimal test function used to compare the code that MSVC and gcc generate
// for a transform-times-vector product (the listings below show its output).
//
// u is overwritten with t*u first; v and w are then each assigned t*u,
// computed from the already-updated u. The previous contents of v and w are
// never read.
// NOTE(review): Vector3f / Affine3f are presumably the Eigen typedefs — confirm
// against the includes / using-declarations, which are not shown here.
// NOTE(review): the marker text says "v*t" while the statements compute t*u;
// presumably just a label — confirm.
void fooa(Vector3f& u, Vector3f& v, Vector3f& w, Affine3f & t){
// Presumably emits a marker comment into the generated assembly so the
// section of interest can be located in the compiler output — confirm.
EIGEN_ASM_COMMENT("begin mul v*t");
// In-place update: u appears on both sides of the assignment.
u = t*u;
// Both of the following use the u produced by the line above.
v = t*u;
w = t*u;
// Closing marker matching the opening one.
EIGEN_ASM_COMMENT("end mul v*t");
}
上述代码在 MSVC 下会生成对以下函数的调用:
Eigen::internal::evaluator<Eigen::Block<Eigen::Matrix<float,4,1,0,4,1>,3,1,0> >::evaluator<Eigen::Block<Eigen::Matrix<float,4,1,0,4,1>,3,1,0> >
和:
Eigen::DenseStorage<float,4,4,1,0>::DenseStorage<float,4,4,1,0> (013F222370h)
而 gcc 不会生成这些调用。这对性能的影响并不明显,但我的应用程序会大量使用此类操作。
这段代码是否只做了部分向量化?这是模板问题,还是优化问题?最重要的是,我能否做些什么,让 MSVC 生成与下面 gcc 输出类似的代码?
MSVC:
EIGEN_ASM_COMMENT("inizio prova mul v*t");
u = t*u;
000000013F221431 test al,0Fh
000000013F221433 je fooa+4Fh (013F22144Fh)
000000013F221435 lea rdx,[string L"c:\\users\\user\\docume"... (013F224550h)]
000000013F22143C lea rcx,[string L"(internal::UIntPtr(a"... (013F2245F0h)]
000000013F221443 mov r8d,6Dh
000000013F221449 call qword ptr [__imp__wassert (013F224160h)]
000000013F22144F movss xmm0,dword ptr [rbx]
000000013F221453 movss xmm4,dword ptr [__real@3f800000 (013F224990h)]
000000013F22145B lea rax,[rbp+27h]
000000013F22145F movss dword ptr [rbp-39h],xmm0
000000013F221464 movss xmm1,dword ptr [rbx+4]
000000013F221469 movss dword ptr [rbp-35h],xmm1
000000013F22146E movss xmm3,dword ptr [rbx+8]
000000013F221473 movss dword ptr [rbp-2Dh],xmm4
000000013F221478 movss dword ptr [rbp-31h],xmm3
000000013F22147D test al,0Fh
000000013F22147F je fooa+0AFh (013F2214AFh)
000000013F221481 lea rdx,[string L"c:\\users\\user\\docume"... (013F224550h)]
000000013F221488 lea rcx,[string L"(internal::UIntPtr(a"... (013F2245F0h)]
000000013F22148F mov r8d,6Dh
000000013F221495 call qword ptr [__imp__wassert (013F224160h)]
000000013F22149B movss xmm4,dword ptr [rbp-2Dh]
000000013F2214A0 movss xmm3,dword ptr [rbp-31h]
000000013F2214A5 movss xmm1,dword ptr [rbp-35h]
000000013F2214AA movss xmm0,dword ptr [rbp-39h]
000000013F2214AF movss xmm2,xmm0
000000013F2214B3 movaps xmm0,xmm1
000000013F2214B6 movaps xmm1,xmm3
v = t*u;
000000013F2214B9 lea rcx,[rbp-39h]
EIGEN_ASM_COMMENT("inizio prova mul v*t");
u = t*u;
000000013F2214BD shufps xmm0,xmm0,0
000000013F2214C1 shufps xmm2,xmm2,0
000000013F2214C5 shufps xmm1,xmm1,0
000000013F2214C9 mulps xmm0,xmmword ptr [rdi+10h]
000000013F2214CD mulps xmm1,xmmword ptr [rdi+20h]
000000013F2214D1 mulps xmm2,xmmword ptr [rdi]
000000013F2214D4 addps xmm2,xmm0
000000013F2214D7 movss xmm0,xmm4
000000013F2214DB addps xmm2,xmm1
000000013F2214DE shufps xmm0,xmm0,0
000000013F2214E2 mulps xmm0,xmmword ptr [rdi+30h]
000000013F2214E6 addps xmm2,xmm0
000000013F2214E9 movaps xmm0,xmm2
000000013F2214EC movaps xmm1,xmm2
000000013F2214EF shufps xmm0,xmm2,55h
000000013F2214F3 shufps xmm1,xmm2,0AAh
EIGEN_ASM_COMMENT("inizio prova mul v*t");
u = t*u;
000000013F2214F7 movss dword ptr [rbp-25h],xmm0
000000013F2214FC movss dword ptr [rbp-21h],xmm1
000000013F221501 movss dword ptr [rbx],xmm2
000000013F221505 movss xmm0,dword ptr [rbp-25h]
000000013F22150A movss dword ptr [rbx+4],xmm0
000000013F22150F movss xmm1,dword ptr [rbp-21h]
000000013F221514 movss dword ptr [rbx+8],xmm1
v = t*u;
000000013F221519 call Eigen::DenseStorage<float,4,4,1,0>::DenseStorage<float,4,4,1,0> (013F222370h)
000000013F22151E movss xmm0,dword ptr [rbx]
000000013F221522 lea rcx,[rbp+27h]
000000013F221526 movss dword ptr [rbp-39h],xmm0
000000013F22152B movss xmm1,dword ptr [rbx+4]
000000013F221530 movss dword ptr [rbp-35h],xmm1
000000013F221535 movss xmm0,dword ptr [rbx+8]
000000013F22153A mov dword ptr [rbp-2Dh],3F800000h
000000013F221541 movss dword ptr [rbp-31h],xmm0
000000013F221546 call Eigen::DenseStorage<float,4,4,1,0>::DenseStorage<float,4,4,1,0> (013F222370h)
000000013F22154B lea rax,[rbp-39h]
000000013F22154F lea rdx,[rbp+37h]
000000013F221553 mov qword ptr [rbp-11h],rax
000000013F221557 lea rax,[rbp-39h]
000000013F22155B lea rcx,[rbp-19h]
000000013F22155F xor r8d,r8d
000000013F221562 mov qword ptr [rbp-19h],rdi
000000013F221566 mov qword ptr [rbp-9],rdi
000000013F22156A mov qword ptr [rbp+7],rax
000000013F22156E mov qword ptr [rbp+17h],4
000000013F221576 call Eigen::internal::product_evaluator<Eigen::Product<Eigen::Matrix<float,4,4,0,4,4>,Eigen::Matrix<float,4,1,0,4,1>,1>,3,Eigen::DenseShape,Eigen::DenseShape,float,float>::packet<16,__m128> (013F222420h)
w = t*u;
000000013F22157B lea rcx,[rbp-39h]
v = t*u;
000000013F22157F movaps xmm2,xmmword ptr [rax]
000000013F221582 movaps xmm0,xmm2
000000013F221585 movaps xmm1,xmm2
000000013F221588 shufps xmm0,xmm2,55h
000000013F22158C shufps xmm1,xmm2,0AAh
000000013F221590 movss dword ptr [rbp-25h],xmm0
000000013F221595 movss dword ptr [rbp-21h],xmm1
000000013F22159A movss dword ptr [r14],xmm2
000000013F22159F movss xmm0,dword ptr [rbp-25h]
000000013F2215A4 movss dword ptr [r14+4],xmm0
000000013F2215AA movss xmm1,dword ptr [rbp-21h]
000000013F2215AF movss dword ptr [r14+8],xmm1
w = t*u;
000000013F2215B5 call Eigen::DenseStorage<float,4,4,1,0>::DenseStorage<float,4,4,1,0> (013F222370h)
000000013F2215BA lea rax,[rbp-39h]
000000013F2215BE xor r14d,r14d
000000013F2215C1 mov qword ptr [rbp-19h],rax
000000013F2215C5 lea rax,[rbp-39h]
000000013F2215C9 lea rdx,[rbp-19h]
000000013F2215CD lea rcx,[rbp+27h]
000000013F2215D1 mov qword ptr [rbp-1],r14
000000013F2215D5 mov qword ptr [rbp+0Fh],4
000000013F2215DD mov qword ptr [rbp-9],rax
000000013F2215E1 call Eigen::internal::evaluator<Eigen::Block<Eigen::Matrix<float,4,1,0,4,1>,3,1,0> >::evaluator<Eigen::Block<Eigen::Matrix<float,4,1,0,4,1>,3,1,0> > (013F2223B0h)
000000013F2215E6 mov eax,dword ptr [rbx]
000000013F2215E8 mov rcx,qword ptr [rbp+27h]
000000013F2215EC mov dword ptr [rcx],eax
000000013F2215EE mov eax,dword ptr [rbx+4]
000000013F2215F1 mov dword ptr [rcx+4],eax
000000013F2215F4 mov eax,dword ptr [rbx+8]
000000013F2215F7 mov dword ptr [rcx+8],eax
000000013F2215FA mov dword ptr [rbp-2Dh],3F800000h
000000013F221601 lea rcx,[rbp+37h]
000000013F221605 call Eigen::DenseStorage<float,4,4,1,0>::DenseStorage<float,4,4,1,0> (013F222370h)
000000013F22160A lea rax,[rbp-39h]
000000013F22160E lea rdx,[rbp+47h]
000000013F221612 mov qword ptr [rbp-11h],rax
000000013F221616 lea rax,[rbp-39h]
000000013F22161A lea rcx,[rbp-19h]
000000013F22161E xor r8d,r8d
000000013F221621 mov qword ptr [rbp-19h],rdi
000000013F221625 mov qword ptr [rbp-9],rdi
000000013F221629 mov qword ptr [rbp+7],rax
000000013F22162D mov qword ptr [rbp+17h],4
000000013F221635 call Eigen::internal::product_evaluator<Eigen::Product<Eigen::Matrix<float,4,4,0,4,4>,Eigen::Matrix<float,4,1,0,4,1>,1>,3,Eigen::DenseShape,Eigen::DenseShape,float,float>::packet<16,__m128> (013F222420h)
000000013F22163A lea rdx,[rbp-19h]
000000013F22163E lea rcx,[rbp+27h]
000000013F221642 mov qword ptr [rbp-1],r14
000000013F221646 movaps xmm0,xmmword ptr [rax]
000000013F221649 movaps xmmword ptr [rbp+37h],xmm0
000000013F22164D lea rax,[rbp+37h]
000000013F221651 mov qword ptr [rbp+0Fh],4
000000013F221659 mov qword ptr [rbp-19h],rax
000000013F22165D lea rax,[rbp+37h]
000000013F221661 mov qword ptr [rbp-9],rax
000000013F221665 call Eigen::internal::evaluator<Eigen::Block<Eigen::Matrix<float,4,1,0,4,1>,3,1,0> >::evaluator<Eigen::Block<Eigen::Matrix<float,4,1,0,4,1>,3,1,0> > (013F2223B0h)
000000013F22166A mov rax,qword ptr [rbp+27h]
EIGEN_ASM_COMMENT("fine prova mul v*t");
}
000000013F22166E lea r11,[rsp+0B0h]
w = t*u;
000000013F221676 movss xmm2,dword ptr [rax]
000000013F22167A movss dword ptr [rbp-29h],xmm2
000000013F22167F movss xmm0,dword ptr [rax+4]
000000013F221684 movss dword ptr [rbp-25h],xmm0
000000013F221689 movss xmm1,dword ptr [rax+8]
EIGEN_ASM_COMMENT("fine prova mul v*t");
}
000000013F22168E mov rbx,qword ptr [r11+10h]
000000013F221692 mov rdi,qword ptr [r11+20h]
000000013F221696 mov r14,qword ptr [r11+28h]
w = t*u;
000000013F22169A movss dword ptr [rbp-21h],xmm1
000000013F22169F movss dword ptr [rsi],xmm2
000000013F2216A3 movss xmm0,dword ptr [rbp-25h]
000000013F2216A8 movss dword ptr [rsi+4],xmm0
000000013F2216AD movss xmm1,dword ptr [rbp-21h]
000000013F2216B2 movss dword ptr [rsi+8],xmm1
EIGEN_ASM_COMMENT("fine prova mul v*t");
}
GCC:
# 18 "main.cpp" 1
#inizio prova mul v*t
# 0 "" 2
#NO_APP
movss 4(%rdi), %xmm0
shufps $0, %xmm0, %xmm0
movss (%rdi), %xmm1
shufps $0, %xmm1, %xmm1
movaps (%rcx), %xmm2
mulps 16(%rcx), %xmm0
mulps %xmm1, %xmm2
movaps %xmm0, %xmm1
movss 8(%rdi), %xmm0
shufps $0, %xmm0, %xmm0
addps %xmm2, %xmm1
mulps 32(%rcx), %xmm0
addps %xmm1, %xmm0
addps 48(%rcx), %xmm0
movaps %xmm0, (%rsp)
movss (%rsp), %xmm0
movss 4(%rsp), %xmm1
movss 8(%rsp), %xmm3
movss %xmm0, (%rdi)
shufps $0, %xmm0, %xmm0
movss %xmm1, 4(%rdi)
shufps $0, %xmm1, %xmm1
movss %xmm3, 8(%rdi)
movaps (%rcx), %xmm2
mulps 16(%rcx), %xmm1
mulps %xmm0, %xmm2
movaps %xmm3, %xmm0
shufps $0, %xmm0, %xmm0
addps %xmm2, %xmm1
mulps 32(%rcx), %xmm0
addps %xmm1, %xmm0
addps 48(%rcx), %xmm0
movaps %xmm0, (%rsp)
movss (%rsp), %xmm0
movss %xmm0, (%rsi)
movss 4(%rsp), %xmm0
movss %xmm0, 4(%rsi)
movss 8(%rsp), %xmm0
movss %xmm0, 8(%rsi)
movss 4(%rdi), %xmm0
shufps $0, %xmm0, %xmm0
movss (%rdi), %xmm1
shufps $0, %xmm1, %xmm1
movaps (%rcx), %xmm2
mulps 16(%rcx), %xmm0
mulps %xmm1, %xmm2
movaps %xmm0, %xmm1
movss 8(%rdi), %xmm0
shufps $0, %xmm0, %xmm0
addps %xmm2, %xmm1
mulps 32(%rcx), %xmm0
addps %xmm1, %xmm0
addps 48(%rcx), %xmm0
movaps %xmm0, (%rsp)
movss (%rsp), %xmm0
movss %xmm0, (%rdx)
movss 4(%rsp), %xmm0
movss %xmm0, 4(%rdx)
movss 8(%rsp), %xmm0
movss %xmm0, 8(%rdx)
#APP
# 22 "main.cpp" 1
#fine prova mul v*t
# 0 "" 2