我正在创建一些函数来执行诸如负数和正数之间的“分离总和”,kahan,成对和其他东西,其中我从矩阵中获取元素的顺序无关紧要,例如:< / p>
template <typename T, int R, int C>
inline T sum(const Eigen::Matrix<T,R,C>& xs)
{
T sumP(0);
T sumN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
if (xs(i,j)>0)
sumP += xs(i,j);
else if (xs(i,j)<0) //ignore 0 elements: improvement for sparse matrices I think
sumN += xs(i,j);
}
return sumP+sumN;
}
现在,我想尽可能提高效率,所以我的问题是,如上所述循环遍历每一行的每一列会更好,或者像下面那样执行相反的操作:
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
(我想这取决于矩阵元素在内存中的分配顺序,但我在Eigen的手册中找不到这个。)
此外,还有其他替代方法,比如使用迭代器(它们存在于Eigen中吗?)可能会稍快一点吗?
答案 0 :(得分:16)
我做了一些基准测试来检查哪种方式更快,我得到了以下结果(以秒为单位):
12
30
3
6
23
3
第一行是按照@jleahy的建议进行迭代。
第二行是迭代,就像我在问题中的代码中所做的那样(@jleahy的逆序)。
第三行是使用PlainObjectBase::data()
进行迭代,如for (int i = 0; i < matrixObject.size(); i++)
。
其他3行重复与上述相同,但有一个临时的@ lucas92
我也做过相同的测试,但是使用替换/ if else。* / for / else /(对稀疏矩阵没有特殊处理),我得到以下(在几秒钟内):
10
27
3
6
24
2
再次进行测试给我的结果非常相似。我将g++ 4.7.3
与-O3
一起使用。代码:
#include <ctime>
#include <iostream>
#include <Eigen/Dense>
using namespace std;
template <typename T, int R, int C>
inline T sum_kahan1(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
if (xs(j,i)>0)
{
yP = xs(j,i) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (xs(j,i)<0)
{
yN = xs(j,i) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
if (xs(i,j)>0)
{
yP = xs(i,j) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (xs(i,j)<0)
{
yN = xs(i,j) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
if ((*(xs.data() + i))>0)
{
yP = (*(xs.data() + i)) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if ((*(xs.data() + i))<0)
{
yN = (*(xs.data() + i)) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan1t(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
T temporary = xs(j,i);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (temporary<0)
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2t(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
T temporary = xs(i,j);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (temporary<0)
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3t(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
T temporary = (*(xs.data() + i));
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (temporary<0)
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan1e(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
if (xs(j,i)>0)
{
yP = xs(j,i) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = xs(j,i) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2e(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
if (xs(i,j)>0)
{
yP = xs(i,j) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = xs(i,j) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3e(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
if ((*(xs.data() + i))>0)
{
yP = (*(xs.data() + i)) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = (*(xs.data() + i)) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan1te(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
T temporary = xs(j,i);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2te(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
T temporary = xs(i,j);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3te(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
T temporary = (*(xs.data() + i));
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
int main() {
Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> test = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>::Random(10000,10000);
cout << "start" << endl;
int now;
now = time(0);
sum_kahan1(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan1t(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2t(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3t(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan1e(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2e(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3e(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan1te(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2te(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3te(test);
cout << time(0) - now << endl;
return 0;
}
答案 1 :(得分:7)
默认情况下,Eigen以列主(Fortran)顺序分配矩阵(documentation)。
迭代矩阵的最快方法是按存储顺序执行,以错误的方式执行此操作会增加缓存未命中数(如果矩阵不适合L1,则会占用您的计算时间,因此读取会增加您的计算时间)乘以cacheline / elemsize(可能是64/8 = 8)。
如果您的矩阵适合L1缓存,这将没有什么区别,但一个好的编译器应该能够矢量化循环,启用AVX(在一个闪亮的新核心i7上)可以给你一个加速四次。 (256位/ 64位)。
最后不要指望任何Eigen的内置函数能给你加速(我不认为有任何迭代器,但我可能会弄错),它们只会给你相同的(非常简单)代码。
TLDR:交换你的迭代顺序,你需要最快地改变行索引。
答案 2 :(得分:3)
我注意到代码相当于矩阵中所有条目的总和,即你可以这样做:
return xs.sum();
我认为这会表现得更好,因为它只有一次通过,而且Eigen应该&#34;知道&#34;如何安排通行证以获得最佳表现。
但是,如果你想保留两个通道,你可以使用系数减少机制表达这一点,如下所示:
return (xs.array() > 0).select(xs, 0).sum() +
(xs.array() < 0).select(xs, 0).sum();
使用布尔系数方式选择来选择正和负条目。我不知道它是否会胜过手卷循环,但理论上编码方式允许Eigen(和编译器)更多地了解你的意图,并可能改善结果。
答案 3 :(得分:1)
尝试将xs(i,j)存储在循环内的临时变量中,这样你只需要调用一次函数。