Eigenlib和小矩阵运算的性能

时间:2018-01-24 14:39:45

标签: c++ performance blas eigen3 avx2

我为我的项目选择了eigenlib,因为我处理了许多小规模的矢量和矩阵运算。当然,我在eigenlib中实现了简单的vector-Matrix-vector产品作为这个函数:

const int dims = 2;

template <typename T>
using Vec = Eigen::Matrix<T, dims, 1>;

template <typename T>
using Mat = Eigen::Matrix<T, dims, dims>;

template <typename T>
inline auto quadraticMetricNorm(const Vec<T>& x1, const Vec<T>& x2, const Mat<T>& D)
{
    return x1.transpose() * D * x2;
}

我希望生成的代码类似于手工制作的解决方案:

template <typename T>
inline auto quadraticMetricNorm(const Vec<T>& x1, const Vec<T>& x2, const Mat<T>& D)
{
    return (x1(0, 0) * D(0, 0) + x1(1, 0) * D(1, 0)) * x2(0, 0) 
         + (x1(0, 0) * D(0, 1) + x1(1, 0) * D(1, 1)) * x2(1, 0);
}
如果我激活AVX2,甚至会更快一点,但经过一些测试后我发现根据我的项目中的探查器,手工制作的解决方案快了近3倍。我做错了什么,或者Eigenlib的表现是否仅仅因为这些小规模问题而劣势?

示例:

int main()
{
    random_device rd;
    mt19937 mt(rd());
    uniform_real_distribution<double> dist(1.0, 10.0);

    std::chrono::nanoseconds total{};
    const uint64_t max_loops = 10000000;
    std::vector<double> arr(max_loops);

    for (uint64_t i = 0; i < max_loops; i++)
    {
        Vec<double> x1;
        x1 << dist(mt), dist(mt);

        Vec<double> x2;
        x2 << dist(mt), dist(mt);

        Mat<double> D;
        D << dist(mt), dist(mt), dist(mt), dist(mt);

        auto start = chrono::high_resolution_clock::now();
        arr[i] = quadraticMetricNorm(x1, x2, D);
        total += chrono::duration_cast<std::chrono::nanoseconds>(chrono::high_resolution_clock::now() - start);
    }

    cout << "Loop took " << total.count() / 1000000 << "ms" << endl;

    return 0;
}

第一个版本的循环耗时约为850毫秒,而计算机上的第二个版本约为440毫秒。

我在Visual Studio Community 2017上运行Intel i7-5820K CPU,在释放模式下激活AVX2(最大化速度(/ O2))。

提前感谢您的帮助!

修改

自请求以来,这是两个程序集。速度越快:

addsd   xmm1, xmm0
mulsd   xmm1, xmm10
mulsd   xmm12, xmm7
mulsd   xmm13, xmm9
addsd   xmm12, xmm13
mulsd   xmm12, xmm11
addsd   xmm1, xmm12

Eigenlib生成的代码要复杂得多(没有AVX2):

lea r8, QWORD PTR D$11[rbp-256]
lea rdx, QWORD PTR $T7[rbp-256]
lea rcx, QWORD PTR $T3[rsp]
call    ??$?DV?$Matrix@N$01$01$0A@$01$01@Eigen@@@?$MatrixBase@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@@Eigen@@QEBA?BV?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@1@AEBV?$MatrixBase@V?$Matrix@N$01$01$0A@$01$01@Eigen@@@1@@Z ; Eigen::MatrixBase<Eigen::Transpose<Eigen::Matrix<double,2,1,0,2,1> const > >::operator*<Eigen::Matrix<double,2,2,0,2,2> >
mov rcx, rax
lea r8, QWORD PTR x2$9[rbp-256]
lea rdx, QWORD PTR $T5[rsp]
call    ??$?DV?$Matrix@N$01$00$0A@$01$00@Eigen@@@?$MatrixBase@V?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@Eigen@@@Eigen@@QEBA?BV?$Product@V?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@Eigen@@V?$Matrix@N$01$00$0A@$01$00@2@$0A@@1@AEBV?$MatrixBase@V?$Matrix@N$01$00$0A@$01$00@Eigen@@@1@@Z ; Eigen::MatrixBase<Eigen::Product<Eigen::Transpose<Eigen::Matrix<double,2,1,0,2,1> const >,Eigen::Matrix<double,2,2,0,2,2>,0> >::operator*<Eigen::Matrix<double,2,1,0,2,1> >
; File ...\eigenlib\eigen\src\core\coreevaluators.h

; 155  :     : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 

lea rax, QWORD PTR $T10[rbp-240]
mov QWORD PTR $T10[rbp-256], rax
; File ...\eigenlib\eigen\src\core\cwisebinaryop.h

; 105  :       : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)

movups  xmm0, XMMWORD PTR $T5[rsp]
movups  XMMWORD PTR $T6[rsp+8], xmm0
mov rax, QWORD PTR $T5[rsp+16]
mov QWORD PTR $T6[rbp-232], rax
; File ...\eigenlib\eigen\src\core\redux.h

; 453  :   return derived().redux(Eigen::internal::scalar_sum_op<Scalar,Scalar>());

lea rdx, QWORD PTR $T1[rsp]
lea rcx, QWORD PTR $T6[rsp]
call    ??$redux@U?$scalar_sum_op@NN@internal@Eigen@@@?$DenseBase@V?$CwiseBinaryOp@U?$scalar_product_op@NN@internal@Eigen@@$$CBV?$Transpose@$$CBV?$Product@V?$Transpose@$$CBV?$Matrix@N$01$00$0A@$01$00@Eigen@@@Eigen@@V?$Matrix@N$01$01$0A@$01$01@2@$0A@@Eigen@@@3@$$CBV?$Matrix@N$01$00$0A@$01$00@3@@Eigen@@@Eigen@@QEBANAEBU?$scalar_sum_op@NN@internal@1@@Z ; Eigen::DenseBase<Eigen::CwiseBinaryOp<Eigen::internal::scalar_product_op<double,double>,Eigen::Transpose<Eigen::Product<Eigen::Transpose<Eigen::Matrix<double,2,1,0,2,1> const >,Eigen::Matrix<double,2,2,0,2,2>,0> const > const ,Eigen::Matrix<double,2,1,0,2,1> const > >::redux<Eigen::internal::scalar_sum_op<double,double> >
movsd   QWORD PTR $T10[rbp-240], xmm0

所以Visual Studio和Eigenlib根本不能很好地融合在一起吗?

0 个答案:

没有答案