正如标题所述,我有一个自定义的UnaryView函数,该函数在不同的操作中被调用多次(主要是与其他矩阵相乘)。例如:
MatrixXd out = mat1.cview() * mat2;
MatrixXd out2 = mat1.cview() * mat3.transpose();
首先将自定义视图复制到一个单独的矩阵中并使用它会更快吗?例如:
MatrixXd mat1_dbl = mat1.cview();
MatrixXd out = mat1_dbl * mat2;
MatrixXd out2 = mat1_dbl * mat3.transpose();
基本上,重复使用UnaryView比复制到矩阵并使用它要慢吗?
答案 0 :(得分:0)
我本该先自己做一下基准测试。Google Benchmark 的结果表明,先复制到矩阵再使用的做法明显更快:
2019-04-09 20:55:55
Running ./CView
Run on (16 X 4053.06 MHz CPU s)
CPU Caches:
L1 Data 32K (x8)
L1 Instruction 64K (x8)
L2 Unified 512K (x8)
L3 Unified 8192K (x2)
--------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------
UnaryView_Repeat 147390919 ns 147385796 ns 5
UnaryView_Copy 139456051 ns 139451409 ns 5
测试所用的代码如下:
#include <stan/math/prim/mat/fun/Eigen.hpp>
#include <stan/math/fwd/mat.hpp>
#include <stan/math/fwd/core.hpp>
#include <benchmark/benchmark.h>
/**
 * Benchmark: evaluate an expression that reads `m_fd1.val_()` three times
 * directly, without first materialising it into a plain `MatrixXd`.
 *
 * Each timed iteration allocates a kDim x kDim result and assigns
 *   val * m_d2 + val^T * m_d2 + exp(val)   (exp applied element-wise)
 * to it, where `val` is `m_fd1.val_()`.
 *
 * NOTE(review): `val_()` is presumably Stan Math's view onto the value
 * component of the `fvar` entries (the "custom UnaryView" under discussion)
 * -- confirm against the Stan Math headers.
 *
 * @param state Google Benchmark state object driving the timed loop.
 */
static void UnaryView_Repeat(benchmark::State& state) {
  using Eigen::MatrixXd;
  using stan::math::matrix_fd;

  // Side length of every (square) matrix used by this benchmark.
  constexpr int kDim = 1000;

  // Setup (outside the timed loop): random values and tangents.
  matrix_fd m_fd1(kDim, kDim);
  m_fd1.val_() = MatrixXd::Random(kDim, kDim);
  m_fd1.d_() = MatrixXd::Random(kDim, kDim);
  const MatrixXd m_d2 = MatrixXd::Random(kDim, kDim);

  for (auto _ : state) {
    MatrixXd out(kDim, kDim);
    out = m_fd1.val_() * m_d2
          + m_fd1.val_().transpose() * m_d2
          + m_fd1.val_().array().exp().matrix();
    // The result is otherwise unused; make it observable so the optimiser
    // cannot discard any part of the computation being measured.
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
}
BENCHMARK(UnaryView_Repeat);
/**
 * Benchmark: copy `m_fd1.val_()` into a plain `MatrixXd` once per iteration
 * and evaluate the expression against that copy.
 *
 * Each timed iteration allocates a kDim x kDim result, copies the view into
 * `m_fd1_val`, and assigns
 *   val * m_d2 + val^T * m_d2 + exp(val)   (exp applied element-wise)
 * to the result, where `val` is the copied matrix. The copy is deliberately
 * inside the timed loop so its cost is part of the measurement.
 *
 * NOTE(review): `val_()` is presumably Stan Math's view onto the value
 * component of the `fvar` entries (the "custom UnaryView" under discussion)
 * -- confirm against the Stan Math headers.
 *
 * @param state Google Benchmark state object driving the timed loop.
 */
static void UnaryView_Copy(benchmark::State& state) {
  using Eigen::MatrixXd;
  using stan::math::matrix_fd;

  // Side length of every (square) matrix used by this benchmark.
  constexpr int kDim = 1000;

  // Setup (outside the timed loop): random values and tangents.
  matrix_fd m_fd1(kDim, kDim);
  m_fd1.val_() = MatrixXd::Random(kDim, kDim);
  m_fd1.d_() = MatrixXd::Random(kDim, kDim);
  const MatrixXd m_d2 = MatrixXd::Random(kDim, kDim);

  for (auto _ : state) {
    MatrixXd out(kDim, kDim);
    // Materialise the view once; all three uses below read this copy.
    MatrixXd m_fd1_val = m_fd1.val_();
    out = m_fd1_val * m_d2
          + m_fd1_val.transpose() * m_d2
          + m_fd1_val.array().exp().matrix();
    // The result is otherwise unused; make it observable so the optimiser
    // cannot discard any part of the computation being measured.
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }
}
BENCHMARK(UnaryView_Copy);
// Google Benchmark's entry-point macro: supplies main() for this program,
// which runs the benchmarks registered above via BENCHMARK(...).
BENCHMARK_MAIN();