我正在从头开始编写一个机器学习库(在CPU上运行)。它涉及对浮点向量的大量操作。我的目标是建立一个非常高效、快速的库。为此,我确定了所需的向量核心操作列表:逐元素加法、逐元素乘法、点积,以及逐元素的sqrt / pow / exp。我的机器学习库的性能在很大程度上取决于这些操作的性能。
我知道,要让float向量操作获得高性能,需要SIMD、指令流水线、缓存行优化,并去掉IEEE浮点标准要求的检查(即不处理非规格化数(denormal),省去NaN / +0 / -0检查等)。
有许多C++库提供了我需要的向量核心操作,并且它们通常借助BLAS(或OpenBLAS)来执行SIMD操作。我使用Armadillo已经很长时间了,但一直没有做过多少基准测试,也没有与其他库做过比较。然而在对我的程序进行性能分析之后,我发现程序在OpenBLAS库中花费的时间可以忽略不计。于是,我决定对不同库实现的性能进行基准测试,并与基于std::vector
的朴素实现进行比较:
矢量大小 - 10 000浮点数,操作次数 - 1 000 000
逐元素加法
std::vector - 2s:477ms:183μs:102ns
armadillo - 2s:452ms:828μs:409ns
blaze - 2s:404ms:917μs:747ns
VCL(SSE2) - 1s:957ms:42μs:608ns
逐元素乘法
std::vector - 2s:438ms:7μs:792ns
armadillo - 2s:433ms:195μs:322ns
blaze - 2s:415ms:86μs:600ns
VCL(SSE2) - 1s:927ms:301μs:214ns
标量/点/内积
std::vector - 2s:681ms:335μs:990ns
armadillo - 2s:600ms:441μs:415ns
blaze - 2s:289ms:95μs:894ns
VCL(SSE2) - 3s:485ms:100μs:644ns
参与比较的库列表
Armadillo(+ openBLAS):http://arma.sourceforge.net/
Blaze(+ openBLAS):https://bitbucket.org/blaze-lib/blaze
VCL:http://www.agner.org/optimize/
令人惊讶的是,朴素实现与Armadillo和Blaze一样快(差异可以忽略不计)。对于逐元素加法和乘法,VCL比其他实现快约23%。这主要归功于GCC的PGO(Profile Guided Optimizations);不使用PGO时,VCL的性能与其他实现相同。
所以,我的问题是:
似乎编译器已经利用SIMD和指令流水线优化了朴素实现。线性代数库真的能在这些核心操作上做得更好吗?(如果需要,我可以补充其他库的基准测试结果来更新帖子。)
我的基准测试方法是否适合这项任务? (代码如下)
基准测试代码(vec-bench.cpp):
#include <armadillo>
#include <blaze/Math.h>
#include <vcl/vectorclass.h>
#include <chrono>
#include <string>
#include <sstream>
#include <iostream>
#include <vector>
// -----------------------BENCHMARK-FUNCTIONS---------------------------
/// Duration type whose tick is one day (86400 seconds), stored as an int.
using days = std::chrono::duration<int, std::ratio<86400>>;

// BENCHMARK_START / BENCHMARK_END(name, itCount) bracket a timed region.
//
// BENCHMARK_START opens a scope and records the start time in `t1`;
// BENCHMARK_END records the end time in `t2`, prints the total elapsed time
// and the elapsed time divided by `itCount` (both formatted by
// hreadableUnit), then closes the scope opened by BENCHMARK_START. The two
// macros must therefore always be used as a pair.
//
// std::chrono::steady_clock is used because it is guaranteed to be
// monotonic, so the measured interval cannot be distorted by wall-clock
// adjustments. Macro arguments are parenthesized so that expressions such as
// `a + b` can be passed as `itCount` without changing the arithmetic.
#define BENCHMARK_START \
    { \
        const std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
#define BENCHMARK_END(name, itCount) \
        const std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now(); \
        std::cout << "Benchmark for " << (name) << ":\n " \
                  << hreadableUnit(std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1)) \
                  << " total\n " \
                  << hreadableUnit(std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1) / (itCount)) \
                  << " average per iteration\n\n"; \
    }
/**
 * Formats a nanosecond duration as a colon-separated breakdown, e.g.
 * "2s:477ms:183μs:102ns".
 *
 * The duration is split into days, hours, minutes, seconds, milliseconds,
 * microseconds and leftover nanoseconds. Leading units whose value is zero
 * are omitted; once one unit has been printed, every smaller unit is printed
 * as well, even when it is zero.
 *
 * @param ns duration to format
 * @return the formatted string, or "N/A" when every component is zero
 */
std::string hreadableUnit(std::chrono::nanoseconds ns) {
    namespace chr = std::chrono;

    std::stringstream out;
    bool started = false;

    // Splits the whole-`unit` part off `ns` and prints it when it is
    // non-zero or when a larger unit has already been printed.
    auto peel = [&](auto unit, const char* suffix) {
        const auto part = chr::duration_cast<decltype(unit)>(ns);
        ns -= part;
        if (part.count() || started) {
            out << part.count() << suffix;
            started = true;
        }
    };

    peel(days{}, "d:");
    peel(chr::hours{}, "h:");
    peel(chr::minutes{}, "m:");
    peel(chr::seconds{}, "s:");
    peel(chr::milliseconds{}, "ms:");
    peel(chr::microseconds{}, "μs:");

    // Whatever is left in `ns` is strictly below one microsecond.
    if (ns.count() || started) {
        out << ns.count() << "ns";
        started = true;
    }

    if (started)
        return out.str();
    return std::string("N/A");
}
// ----------------------ACTUAL-PROCESSING------------------------------
int main() {
/* vector size, number of iterations */
const unsigned int vSize = 10000, itCount = 1000000;
float sum;
/* std */
std::vector<float> v(vSize), u(vSize);
/* armadillo */
arma::fvec av(vSize, arma::fill::randu), au(vSize, arma::fill::randu);
unsigned int i, j;
/* blaze */
blaze::StaticVector<float, vSize, blaze::columnVector> bv, bu;
/* SSE2 VCL (my processor cannot do better than SSE2) */
unsigned int sse2VSize = vSize / 4, reminder = vSize % 4;
std::vector<Vec4f> vclv(sse2VSize), vclu(sse2VSize);
std::vector<float> vclrv(reminder), vclru(reminder); /* reminder */
std::cout << "----------------ELEMENT-WISE-ADDITION-----------------\n";
BENCHMARK_START
for (i = 0; i < itCount; ++i)
for (j = 0; j < vSize; ++j)
v[j] += u[j];
BENCHMARK_END("standard vector addition", itCount)
BENCHMARK_START
for (i = 0; i < itCount; ++i)
av += au;
BENCHMARK_END("armadillo vector addition", itCount)
BENCHMARK_START
for (i = 0; i < itCount; ++i)
bv += bu;
BENCHMARK_END("blaze vector addition", itCount)
BENCHMARK_START
for (i = 0; i < itCount; ++i)
for (j = 0; j < sse2VSize; ++j)
vclv[j] += vclu[j];
if (reminder) {
for (i = 0; i < itCount; ++i)
for (j = 0; j < reminder; ++j)
vclrv[j] += vclru[j];
}
BENCHMARK_END("VCL(SSE2 - 4floats) vector addition", itCount)
std::cout << "-------------ELEMENT-WISE-MULTIPLICATION--------------\n";
BENCHMARK_START
for (i = 0; i < itCount; ++i)
for (j = 0; j < vSize; ++j)
v[j] *= u[j];
BENCHMARK_END("standard vector element wise multiplication", itCount)
BENCHMARK_START
for (i = 0; i < itCount; ++i)
av %= au;
BENCHMARK_END("armadillo vector element wise multiplication", itCount)
BENCHMARK_START
for (i = 0; i < itCount; ++i)
bv *= bu;
BENCHMARK_END("blaze vector element wise multiplication", itCount)
BENCHMARK_START
for (i = 0; i < itCount; ++i)
for (j = 0; j < sse2VSize; ++j)
vclv[j] *= vclu[j];
if (reminder) {
for (i = 0; i < itCount; ++i)
for (j = 0; j < reminder; ++j)
vclrv[j] *= vclru[j];
}
BENCHMARK_END("VCL(SSE2 - 4floats) element wise multiplication", itCount)
std::cout << "------------SCALAR / DOT / INNER PRODUCT-------------\n";
BENCHMARK_START
sum = 0;
for (i = 0; i < itCount; ++i)
for (j = 0; j < vSize; ++j)
sum += (v[j] * u[j]);
/* force the compiler to not remove
* the calculation by using the 'sum'
* variable */
std::cout << sum << "\n";
BENCHMARK_END("standard vector scalar / dot / inner product", itCount)
BENCHMARK_START
auto aut = au.t(); /* transposed */
sum = 0;
for (i = 0; i < itCount; ++i)
sum += as_scalar(av * aut);
std::cout << sum << "\n";
BENCHMARK_END("armadillo vector scalar / dot / inner product", itCount)
BENCHMARK_START
blaze::StaticVector<float, vSize, blaze::rowVector> but = blaze::trans(bu); /* transposed */
sum = 0;
for (i = 0; i < itCount; ++i)
sum += blaze::dotu(bv, but);
std::cout << sum << "\n";
BENCHMARK_END("blaze vector scalar / dot / inner product", itCount)
BENCHMARK_START
sum = 0;
for (i = 0; i < itCount; ++i)
for (j = 0; j < sse2VSize; ++j)
sum += horizontal_add((vclv[j] * vclu[j]));
if (reminder) {
for (i = 0; i < itCount; ++i)
for (j = 0; j < reminder; ++j)
sum += (vclrv[j] * vclru[j]);
}
std::cout << sum << "\n";
BENCHMARK_END("VCL(SSE2 - 4floats) scalar / dot / inner product", itCount)
return 0;
}
基准测试的编译命令:
g++ -DARMA_DONT_USE_WRAPPER -DARMA_USE_BLAS -DNDEBUG -DARMA_NO_DEBUG -march=native -O3 -ffast-math -std=c++14 -o vec-bench vec-bench.cpp -lopenblas -fprofile-use
编辑(补充)
Oprofile数据:
逐元素加法
armadillo: 99.6462% -> void arma::arrayops::inplace_plus_base<float>(float*, float const*, unsigned long long)
blaze: 99.7094% -> main (meaning all blaze functions are inlined and it never goes into openblas)
逐元素乘法
armadillo: 99.6596% -> void arma::arrayops::inplace_mul_base<float>(float*, float const*, unsigned long long)
blaze: 99.6650% -> main (same as above)
标量积(点积)
armadillo: 84.3580% -> /usr/lib/libopenblasp-r0.2.18.so
blaze: 84.8001% -> /usr/lib/libopenblasp-r0.2.18.so
所以Armadillo和Blaze只在点积运算中使用BLAS。