我为我的项目选择了Eigen库(eigenlib),因为我需要处理大量小规模的向量和矩阵运算。很自然地,我用Eigen把简单的向量-矩阵-向量乘积(vector-Matrix-vector product)实现成了下面这个函数:
// Compile-time dimension shared by the fixed-size vector and matrix aliases below.
constexpr int dims = 2;

// Fixed-size column vector with `dims` rows and a single column.
template <typename Scalar>
using Vec = Eigen::Matrix<Scalar, dims, 1>;

// Fixed-size square matrix with `dims` rows and `dims` columns.
template <typename Scalar>
using Mat = Eigen::Matrix<Scalar, dims, dims>;
/// Evaluates the product x1^T * D * x2 and returns it as a plain scalar.
///
/// @tparam T  Scalar type of the vectors and the matrix.
/// @param x1  Left vector; it is transposed in the product.
/// @param x2  Right vector.
/// @param D   Matrix placed between the two vectors.
/// @return    The value of x1^T * D * x2, converted to T.
template <typename T>
inline T quadraticMetricNorm(const Vec<T>& x1, const Vec<T>& x2, const Mat<T>& D)
{
    // The return type is spelled out as T on purpose. With a deduced `auto`
    // return type the function would hand back Eigen's lazy product expression
    // object (which refers to the operands) instead of a number, and the
    // evaluation would only happen later at the call site. Returning T forces
    // the 1x1 product to be evaluated here, before the function returns.
    return x1.transpose() * D * x2;
}
我原本期望它生成的代码与下面这个手写的实现类似:
/// Hand-expanded version of x1^T * D * x2 that reads the coefficients directly.
///
/// Only the indices 0 and 1 are accessed, so this form is written for the
/// two-dimensional case.
///
/// @param x1  Left vector.
/// @param x2  Right vector.
/// @param D   Matrix placed between the two vectors.
/// @return    (x1^T * D) * x2 computed coefficient by coefficient.
template <typename T>
inline auto quadraticMetricNorm(const Vec<T>& x1, const Vec<T>& x2, const Mat<T>& D)
{
    // Entries of the row vector x1^T * D: one per column of D.
    const auto column0 = x1(0, 0) * D(0, 0) + x1(1, 0) * D(1, 0);
    const auto column1 = x1(0, 0) * D(0, 1) + x1(1, 0) * D(1, 1);

    // Contract that row vector with x2.
    return column0 * x2(0, 0) + column1 * x2(1, 0);
}
如果启用AVX2,甚至应该会更快一点。但经过一些测试后,我项目中的性能分析器(profiler)显示,手写的实现快了近3倍。是我做错了什么,还是Eigen在这类小规模问题上的性能本来就比较差?
示例:
// Benchmark driver: draws random operands, calls quadraticMetricNorm once per
// iteration, accumulates the time spent around each call and prints the total
// in milliseconds.
int main()
{
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_real_distribution<double> sample(1.0, 10.0);

    std::chrono::nanoseconds elapsed{};
    const uint64_t iterations = 10000000;

    // Every result is stored so that each call's value is written somewhere.
    std::vector<double> results(iterations);

    for (uint64_t run = 0; run < iterations; ++run)
    {
        // Fresh random operands for every iteration; filled outside the timed region.
        Vec<double> x1;
        x1 << sample(engine), sample(engine);
        Vec<double> x2;
        x2 << sample(engine), sample(engine);
        Mat<double> D;
        D << sample(engine), sample(engine), sample(engine), sample(engine);

        // Only the call itself sits between the two timestamps.
        // NOTE(review): the clock is read twice per iteration, so the measured
        // interval presumably contains some clock-read overhead -- confirm
        // before comparing absolute numbers.
        const auto tic = std::chrono::high_resolution_clock::now();
        results[run] = quadraticMetricNorm(x1, x2, D);
        const auto toc = std::chrono::high_resolution_clock::now();

        elapsed += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
    }

    // Nanoseconds -> milliseconds by integer division.
    std::cout << "Loop took " << elapsed.count() / 1000000 << "ms" << std::endl;
    return 0;
}
在我的计算机上,第一个版本的循环耗时约850毫秒,而第二个版本约440毫秒。
我使用的是Visual Studio Community 2017,CPU为Intel i7-5820K,在Release模式下编译并启用了AVX2(优化选项为“最大化速度”(/O2))。
提前感谢您的帮助!
应大家的要求,下面给出两个版本的汇编代码。较快的那个:
addsd xmm1, xmm0
mulsd xmm1, xmm10
mulsd xmm12, xmm7
mulsd xmm13, xmm9
addsd xmm12, xmm13
mulsd xmm12, xmm11
addsd xmm1, xmm12
Eigenlib生成的代码要复杂得多(没有AVX2):
lea r8, QWORD PTR D$11[rbp-256]
lea rdx, QWORD PTR $T7[rbp-256]
lea rcx, QWORD PTR $T3[rsp]
call ??$?DV?$Matrix@N$01$01$0A@$01$01@Eigen@@@?$MatrixBase@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@@Eigen@@QEBA?BV?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@1@AEBV?$MatrixBase@V?$Matrix@N$01$01$0A@$01$01@Eigen@@@1@@Z ; Eigen::MatrixBase<Eigen::Transpose<Eigen::Matrix<double,2,1,0,2,1> const > >::operator*<Eigen::Matrix<double,2,2,0,2,2> >
mov rcx, rax
lea r8, QWORD PTR x2$9[rbp-256]
lea rdx, QWORD PTR $T5[rsp]
call ??$?DV?$Matrix@N$01$00$0A@$01$00@Eigen@@@?$MatrixBase@V?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@Eigen@@@Eigen@@QEBA?BV?$Product@V?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@Eigen@@V?$Matrix@N$01$00$0A@$01$00@2@$0A@@1@AEBV?$MatrixBase@V?$Matrix@N$01$00$0A@$01$00@Eigen@@@1@@Z ; Eigen::MatrixBase<Eigen::Product<Eigen::Transpose<Eigen::Matrix<double,2,1,0,2,1> const >,Eigen::Matrix<double,2,2,0,2,2>,0> >::operator*<Eigen::Matrix<double,2,1,0,2,1> >
; File ...\eigenlib\eigen\src\core\coreevaluators.h
; 155 : : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride())
lea rax, QWORD PTR $T10[rbp-240]
mov QWORD PTR $T10[rbp-256], rax
; File ...\eigenlib\eigen\src\core\cwisebinaryop.h
; 105 : : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
movups xmm0, XMMWORD PTR $T5[rsp]
movups XMMWORD PTR $T6[rsp+8], xmm0
mov rax, QWORD PTR $T5[rsp+16]
mov QWORD PTR $T6[rbp-232], rax
; File ...\eigenlib\eigen\src\core\redux.h
; 453 : return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());
lea rdx, QWORD PTR $T1[rsp]
lea rcx, QWORD PTR $T6[rsp]
call ??$redux@U?$scalar_sum_op@NN@internal@Eigen@@@?$DenseBase@V?$CwiseBinaryOp@U?$scalar_product_op@NN@internal@Eigen@@$$CBV?$Transpose@$$CBV?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@Eigen@@@3@$$CBV?$Matrix@N$01$00$0A@$01$00@3@@Eigen@@@Eigen@@QEBANAEBU?$scalar_sum_op@NN@internal@1@@Z ; Eigen::DenseBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_product_op<double,double>,Eigen::Transpose<Eigen::Product<Eigen::Transpose<Eigen::Matrix<double,2,1,0,2,1> const >,Eigen::Matrix<double,2,2,0,2,2>,0> const > const ,Eigen::Matrix<double,2,1,0,2,1> const > >::redux<Eigen::internal::scalar_sum_op<double,double> >
movsd QWORD PTR $T10[rbp-240], xmm0
所以Visual Studio和Eigen根本就无法很好地配合吗?