我只是想知道两种情况之间是否存在差异
第一段代码中,外层 for 循环遍历行,内层 for 循环遍历列;
第二段代码中,外层 for 循环遍历列,内层 for 循环遍历行。
这两种写法我都试过,得到的结果相同。
答案 0 :(得分:1)
你会得到相同的结果,但你可能会得到不同的表现。
矩阵很可能是以行主序存储的,按行访问通常能获得更好的内存带宽和缓存利用率。可以对一个非常大的矩阵做同样的操作,并测量实际耗时(wall time)。
答案 1 :(得分:1)
下面说明如何计时。先给出我的结果:报告的数字是 10000 次试验的平均 CPU 时钟周期数。
$ clang++ -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided: 60149
sequential: 27137
$ g++-4.9 -Ofast -DNDEBUG -std=c++11 mat_add.cpp -o mat_add && ./mat_add
avg. cycles 100x100 doubles matrix addition
strided: 90517
sequential: 33407
顺序访问速度更快。原因在于缓存行为,尤其是缓存行(cache line)。下面是一篇关于这个主题的有趣文章:
http://igoro.com/archive/gallery-of-processor-cache-effects/
我区分的是跨步(strided)访问和顺序(sequential)访问,而不是行和列,因为行和列的划分是任意的。通常在 C++ 中,我们认为连续存放的元素位于同一行,但这纯粹是约定,并非语言本身所固有。不同的库遵循不同的约定。
测试代码。
// timing
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
#include <stdint.h>
#include <stdlib.h>
#include <utility>
// Timestamp state shared by start_count() and stop_count().
// start / stop hold the full 64-bit counter values assembled in stop_count().
uint64_t start, stop;
// High and low 32-bit halves captured by start_count().
unsigned cycles_high;
unsigned cycles_low;
// High and low 32-bit halves captured by stop_count().
unsigned cycles_high1;
unsigned cycles_low1;
// Result of the last measurement (stop - start). Declared 64-bit so the
// difference of the two 64-bit readings is stored without truncation.
uint64_t ellapsed_cycles;
// Records the starting timestamp of a timed region.
// Executes CPUID followed by RDTSC, then copies EDX into the global
// cycles_high and EAX into the global cycles_low. stop_count() later combines
// those two halves into the 64-bit `start` value.
// NOTE(review): CPUID ahead of RDTSC presumably serves as the serializing
// barrier described in the Intel white paper linked above — confirm there.
static inline void start_count()
{
asm volatile(
"CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
// Outputs: %0 -> cycles_high, %1 -> cycles_low.
: "=r" (cycles_high), "=r" (cycles_low)
:
// Registers the asm body is declared to overwrite.
: "%rax", "%rbx", "%rcx", "%rdx");
}
// Records the ending timestamp of a timed region and computes the elapsed count.
// Executes RDTSCP, copies EDX into cycles_high1 and EAX into cycles_low1, and
// then executes CPUID. Afterwards the globals `start` and `stop` are rebuilt
// from their high/low halves and their difference is stored in ellapsed_cycles.
// Must be paired with a preceding start_count(), which fills cycles_high/cycles_low.
// NOTE(review): the RDTSCP-then-CPUID ordering presumably follows the Intel
// white paper linked above — confirm there.
static inline void stop_count()
{
asm volatile(
"RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t"
// Outputs: %0 -> cycles_high1, %1 -> cycles_low1.
: "=r" (cycles_high1), "=r" (cycles_low1)
:
// Registers the asm body is declared to overwrite.
: "%rax", "%rbx", "%rcx", "%rdx");
// Widen the high half to 64 bits before shifting so no bits are lost.
start = ( ((uint64_t)cycles_high << 32) | cycles_low );
stop = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 );
// The 64-bit difference is converted to whatever type ellapsed_cycles is declared with.
ellapsed_cycles = stop - start;
}
// matrix addition
#include <cstddef>
#include <memory>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>
using std::size_t;
/// Dense n-by-m matrix of T stored contiguously in row-major order:
/// element (i, j) lives at offset i * m + j.
///
/// The buffer is owned through std::unique_ptr<T[]>. Copy construction makes a
/// deep copy and move construction transfers the buffer, so returning a Matrix
/// by value is safe even when the copy is not elided. Assignment is not
/// available because the dimensions are const.
template<class T>
class Matrix
{
public:
    /// Creates an n-by-m matrix whose elements are default-initialized
    /// (i.e. not zeroed for arithmetic T); assign every element before reading it.
    Matrix(const size_t n, const size_t m)
        : elems_(new T[n * m]), n_(n), m_(m)
    {}

    /// Creates an n-by-m matrix where element (i, j) is copied from elems[i][j].
    /// Preconditions (checked with assert): n != 0, m != 0, and elems provides
    /// at least n rows with at least m values each.
    Matrix(const size_t n, const size_t m, const std::vector< std::vector<T> >& elems)
        : elems_(new T[n * m]), n_(n), m_(m)
    {
        assert(n != 0 && m != 0);
        assert(elems.size() >= n);
        for (size_t i = 0; i != n_; ++i)
        {
            assert(elems[i].size() >= m);
            for (size_t j = 0; j != m_; ++j)
            {
                // The row stride is the column count m_, matching operator().
                elems_[i * m_ + j] = elems[i][j];
            }
        }
    }

    /// Deep copy: allocates a new buffer and copies every element.
    Matrix(const Matrix& other)
        : elems_(new T[other.n_ * other.m_]), n_(other.n_), m_(other.m_)
    {
        const size_t count = n_ * m_;
        for (size_t k = 0; k != count; ++k)
        {
            elems_[k] = other.elems_[k];
        }
    }

    /// Takes over other's buffer. Afterwards `other` still reports its old
    /// dimensions but owns no storage, so it may only be destroyed.
    Matrix(Matrix&& other) noexcept
        : elems_(std::move(other.elems_)), n_(other.n_), m_(other.m_)
    {}

    /// Not assignable: n_ and m_ are const.
    Matrix& operator=(const Matrix&) = delete;

    /// Mutable access to element (i, j); asserts that the indices are in range.
    T& operator()(const size_t i, size_t j)
    {
        assert(i < n_ && j < m_);
        return elems_[i * m_ + j];
    }

    /// Read-only access to element (i, j); asserts that the indices are in range.
    const T& operator()(const size_t i, size_t j) const
    {
        assert(i < n_ && j < m_);
        return elems_[i * m_ + j];
    }

    /// Prints the matrix as "[ row0\n row1 ... ]", each element in fixed
    /// notation with width 6 and precision 2 followed by a space.
    /// The std::fixed flag and the precision remain set on `os` afterwards.
    /// A matrix with zero rows prints as "[  ]".
    friend std::ostream& operator<<(std::ostream& os, const Matrix& mat)
    {
        os << "[ ";
        for (size_t i = 0; i != mat.n_; ++i)
        {
            // Every row after the first gets a leading space.
            if (i != 0)
                os << " ";
            for (size_t j = 0; j != mat.m_; ++j)
            {
                // width() resets after each insertion, so set it per element.
                os << std::fixed;
                os.width(6);
                os.precision(2);
                os << mat(i, j) << " ";
            }
            // Newline between rows, but not after the last one.
            if (i + 1 != mat.n_)
                os << "\n";
        }
        os << " ]";
        return os;
    }

    /// Number of rows.
    size_t n() const { return n_; }
    /// Number of columns.
    size_t m() const { return m_; }

private:
    std::unique_ptr<T[]> elems_;  // n_ * m_ elements, row-major
    const size_t n_;              // rows
    const size_t m_;              // columns
};
/// Element-wise sum of two equally sized matrices.
/// Walks the operands with the row index in the outer loop and the column
/// index in the inner loop, i.e. in the order Matrix::operator() lays
/// elements out (i * m + j), so consecutive iterations touch adjacent elements.
/// Asserts that both operands have the same dimensions.
template<class T>
Matrix<T> add_sequential(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());
    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);
    for (size_t row = 0; row < rows; ++row)
        for (size_t col = 0; col < cols; ++col)
            result(row, col) = mat1(row, col) + mat2(row, col);
    return result;
}
/// Element-wise sum of two equally sized matrices.
/// Walks the operands with the column index in the outer loop and the row
/// index in the inner loop, so consecutive iterations are m elements apart
/// in the i * m + j layout used by Matrix::operator().
/// Asserts that both operands have the same dimensions.
template<class T>
Matrix<T> add_strided(const Matrix<T>& mat1, const Matrix<T>& mat2)
{
    assert(mat1.n() == mat2.n() && mat1.m() == mat2.m());
    const size_t rows = mat1.n();
    const size_t cols = mat1.m();
    Matrix<T> result(rows, cols);
    for (size_t col = 0; col < cols; ++col)
        for (size_t row = 0; row < rows; ++row)
            result(row, col) = mat1(row, col) + mat2(row, col);
    return result;
}
// misc: making random matrices, flushing cache, running timing tests
/// Builds an n-by-m matrix filled with values drawn from
/// std::uniform_real_distribution<T>(-100.0, 100.0).
/// The engine and distribution are function-local statics (one pair per T),
/// default-constructed and never explicitly seeded, and shared by all calls.
/// Elements are filled column by column (column index in the outer loop).
template<class T>
Matrix<T> rand_real_mat(const size_t n, const size_t m)
{
    static std::default_random_engine engine;
    static std::uniform_real_distribution<T> distribution(-100.0, 100.0);
    Matrix<T> result(n, m);
    for (size_t col = 0; col < m; ++col)
        for (size_t row = 0; row < n; ++row)
            result(row, col) = distribution(engine);
    return result;
}
#include <fstream>
// Tries to push previously used data out of the CPU caches by writing to one
// byte in every stride of a large scratch buffer.
//
// The previous implementation used formatted `>>` extraction on the binary
// /dev/random stream; that extraction fails on non-numeric bytes, after which
// the loop read an uninitialized int and touched no meaningful amount of memory.
//
// NOTE(review): sweep_bytes is assumed to exceed the last-level cache and
// stride_bytes is assumed not to exceed the cache-line size of the target
// machine — TODO confirm/tune for the hardware being measured.
void flush_cache()
{
    static const size_t sweep_bytes = 32u * 1024u * 1024u;
    static const size_t stride_bytes = 64;
    // Allocated (and zero-filled) once, on the first call.
    static std::vector<unsigned char> scratch(sweep_bytes);
    // volatile keeps the compiler from eliding the otherwise unused accesses.
    volatile unsigned char* const bytes = scratch.data();
    for (size_t offset = 0; offset < sweep_bytes; offset += stride_bytes)
    {
        bytes[offset] = static_cast<unsigned char>(bytes[offset] + 1);
    }
}
/// Times `fnc` on freshly generated random 100x100 matrices.
///
/// For each of the n_times trials: two random matrices are created,
/// flush_cache() is called, and `fnc(m1, m2)` is bracketed by
/// start_count()/stop_count(). The value left in the global ellapsed_cycles
/// is recorded for that trial.
///
/// @param fnc      binary matrix function to measure; its result must be
///                 streamable with operator<<.
/// @param n_times  number of trials to run (previously ignored in favour of a
///                 hard-coded 1000).
/// @return one recorded cycle count per trial, in execution order.
template<class R, class ElemType>
static inline std::vector<uint64_t> time_mat_fnc(
    R (fnc)(const Matrix<ElemType>&, const Matrix<ElemType>&),
    const size_t n_times)
{
    std::ofstream devnull("/dev/null", std::ofstream::binary);
    std::vector<uint64_t> times;
    times.reserve(n_times);
    static const size_t n = 100;
    static const size_t m = 100;
    for (size_t i = 0; i != n_times; ++i)
    {
        // create 2 random n x m matrices
        const auto m1 = rand_real_mat<ElemType>(n, m);
        const auto m2 = rand_real_mat<ElemType>(n, m);
        flush_cache();
        // addition
        start_count();
        const auto sum = fnc(m1, m2);
        stop_count();
        times.push_back(ellapsed_cycles);
        // prevent optimizing away unused result
        devnull << sum;
    }
    return times;
}
/// Arithmetic mean of the elements of a container.
///
/// The sum is accumulated in C::value_type and divided by the element count
/// converted to C::value_type, so the division happens in the element type
/// (integer element types therefore use integer division, and negative sums
/// of signed types are no longer divided as unsigned values).
///
/// @param cntnr  any container that works with range-for and exposes value_type.
/// @return the mean, or a value-initialized result (0) for an empty container,
///         which previously divided by zero.
template<typename C>
decltype(std::declval<typename C::value_type>()/std::declval<typename C::value_type>())
average(const C& cntnr)
{
    typedef typename C::value_type value_type;
    typedef decltype(std::declval<value_type>() / std::declval<value_type>()) result_type;

    value_type sum = 0;
    size_t size = 0;
    for (const auto& value : cntnr)
    {
        sum += value;
        ++size;
    }
    if (size == 0)
    {
        return result_type();
    }
    // Dividing by a size_t would convert a signed sum to unsigned first.
    return sum / static_cast<value_type>(size);
}
// Benchmarks strided vs. sequential matrix addition on double matrices and
// prints the average recorded cycle count of each variant.
int main()
{
    using Elem = double;
    const size_t n_trials = 10000;

    // The strided variant is measured first, then the sequential one.
    const auto strided_samples = time_mat_fnc(add_strided<Elem>, n_trials);
    const auto sequential_samples = time_mat_fnc(add_sequential<Elem>, n_trials);

    const auto strided_mean = average(strided_samples);
    const auto sequential_mean = average(sequential_samples);

    std::cout << "avg. cycles 100x100 doubles matrix addition" << std::endl;
    std::cout << "strided: " << strided_mean << std::endl;
    std::cout << "sequential: " << sequential_mean << std::endl;
}