Question

我正在从头开始编写一个机器学习库（运行在 CPU 上）。它涉及对浮点向量的大量操作。我的目标是建立一个非常高效、快速的库。为此，我列出了所需的向量核心操作：逐元素加法、逐元素乘法、点积，以及逐元素的 sqrt / pow / exp。我的机器学习库的性能在很大程度上取决于这些操作的性能。

我知道要让 float 向量操作达到高性能，需要 SIMD、指令流水线、缓存行优化，并去除 IEEE 浮点标准所要求的检查（即去掉对非规格化数（denormal）、NaN、+0 / -0 等的检查）。

有许多 C++ 库提供了我需要的向量核心操作，并且它们通常借助 BLAS（或 OpenBLAS）来执行 SIMD 运算。我使用 Armadillo 已有很长一段时间，但没有做过多少基准测试，也没有与其他库比较过。但是在对我的程序进行了一些性能分析之后，我发现该程序在 OpenBLAS 库中花费的时间可以忽略不计。于是，我决定对不同库实现的性能进行基准测试，并与基于 std::vector 的朴素实现进行比较：

矢量大小 - 10 000浮点数，操作次数 - 1 000 000

逐元素加法

std::vector - 2s:477ms:183μs:102ns
armadillo - 2s:452ms:828μs:409ns
blaze - 2s:404ms:917μs:747ns
VCL(SSE2) - 1s:957ms:42μs:608ns

逐元素乘法

std::vector - 2s:438ms:7μs:792ns
armadillo - 2s:433ms:195μs:322ns
blaze - 2s:415ms:86μs:600ns
VCL(SSE2) - 1s:927ms:301μs:214ns

标量/点/内积

std::vector - 2s:681ms:335μs:990ns
armadillo - 2s:600ms:441μs:415ns
blaze - 2s:289ms:95μs:894ns
VCL(SSE2) - 3s:485ms:100μs:644ns

参与比较的库列表

Armadillo（+ openBLAS）：http://arma.sourceforge.net/

Blaze（+ openBLAS）：https://bitbucket.org/blaze-lib/blaze

VCL：http://www.agner.org/optimize/

令人惊讶的是，朴素实现与 Armadillo 和 Blaze 一样快（差异可以忽略不计）。对于逐元素加法和乘法，VCL 比其他实现快约 23％。这主要归功于 GCC 的 PGO（Profile Guided Optimizations）；如果不启用 PGO，VCL 的性能与其他实现相同。

所以，我的问题是：

似乎编译器已经使用 SIMD 和指令流水线优化了朴素实现。线性代数库是否真的可以在这些核心操作上做得更好？（如果需要，我可以补充其他库的基准测试结果来更新帖子。）
我的基准测试方法是否适合这项任务？（代码如下）

基准代码 - vec-bench.cpp：

#include <armadillo>
#include <blaze/Math.h>
#include <vcl/vectorclass.h>
#include <chrono>
#include <string>
#include <sstream>
#include <iostream>
#include <vector>

// -----------------------BENCHMARK-FUNCTIONS---------------------------
// Whole days as a chrono duration: one tick per 86400 seconds, counted in int.
using days = std::chrono::duration<int, std::ratio<86400>>;

/*
 * BENCHMARK_START / BENCHMARK_END(name, itCount) bracket a timed region.
 *
 * BENCHMARK_START opens a new scope and records the start time in `t1`;
 * BENCHMARK_END records the end time in `t2`, prints the total elapsed time
 * and the elapsed time divided by `itCount` (both formatted by
 * hreadableUnit), then closes the scope. The two macros must therefore be
 * used as a pair, and anything declared between them is local to that pair.
 *
 * std::chrono::steady_clock is used because it is monotonic;
 * high_resolution_clock may be an alias of the adjustable system clock, in
 * which case an elapsed interval can be distorted or even negative.
 * Both macro arguments are parenthesized so that expression arguments
 * (e.g. `a + b` as itCount) are evaluated as a unit.
 */
#define BENCHMARK_START \
{   \
    const std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();

#define BENCHMARK_END(name, itCount) \
    const std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now(); \
    std::cout << "Benchmark for " << (name) << ":\n  " << hreadableUnit(std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1)) \
                        << " total\n  " << hreadableUnit(std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1) / (itCount)) \
                        << " average per iteration\n\n"; \
}

/**
 * Formats a nanosecond duration as a colon-separated breakdown, e.g.
 * "2s:477ms:183μs:102ns".
 *
 * Units run from days down to nanoseconds. Leading units whose value is zero
 * are omitted; once one unit has been printed, every smaller unit is printed
 * too, even when it is zero. A duration of exactly zero yields "N/A".
 * A negative duration is printed as its magnitude with a single leading '-'.
 *
 * @param ns duration to format
 * @return the formatted text
 */
std::string hreadableUnit(std::chrono::nanoseconds ns) {
  // Local alias so the function does not rely on a file-level day typedef.
  using day_duration = std::chrono::duration<long long, std::ratio<86400>>;

  // Decompose the magnitude and emit the sign once, instead of letting every
  // component carry its own minus sign.
  const bool negative = ns.count() < 0;
  if (negative)
    ns = -ns;

  // Peel off each unit from largest to smallest; `ns` ends up holding only
  // the sub-microsecond remainder.
  const auto d = std::chrono::duration_cast<day_duration>(ns);
  ns -= d;
  const auto h = std::chrono::duration_cast<std::chrono::hours>(ns);
  ns -= h;
  const auto m = std::chrono::duration_cast<std::chrono::minutes>(ns);
  ns -= m;
  const auto s = std::chrono::duration_cast<std::chrono::seconds>(ns);
  ns -= s;
  const auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(ns);
  ns -= millis;
  const auto micros = std::chrono::duration_cast<std::chrono::microseconds>(ns);
  ns -= micros;

  struct Component {
    long long value;
    const char *suffix;
  };
  const Component components[] = {
      {static_cast<long long>(d.count()), "d:"},
      {static_cast<long long>(h.count()), "h:"},
      {static_cast<long long>(m.count()), "m:"},
      {static_cast<long long>(s.count()), "s:"},
      {static_cast<long long>(millis.count()), "ms:"},
      {static_cast<long long>(micros.count()), "μs:"},
      {static_cast<long long>(ns.count()), "ns"},
  };

  std::ostringstream out;
  bool started = false; // becomes true at the first non-zero unit
  for (const Component &c : components) {
    if (c.value == 0 && !started)
      continue;
    if (!started && negative)
      out << '-';
    out << c.value << c.suffix;
    started = true;
  }

  if (!started)
    return "N/A";

  return out.str();
}


// ----------------------ACTUAL-PROCESSING------------------------------
int main() {
    /* vector size, number of iterations */
    const unsigned int vSize = 10000, itCount = 1000000;
    float sum;

    /* std */
    std::vector<float> v(vSize), u(vSize);

    /* armadillo */
    arma::fvec av(vSize, arma::fill::randu), au(vSize, arma::fill::randu);
    unsigned int i, j;

    /* blaze */
    blaze::StaticVector<float, vSize, blaze::columnVector> bv, bu;

    /* SSE2 VCL (my processor cannot do better than SSE2) */
    unsigned int sse2VSize = vSize / 4, reminder = vSize % 4; 

    std::vector<Vec4f> vclv(sse2VSize), vclu(sse2VSize);
    std::vector<float> vclrv(reminder), vclru(reminder); /* reminder */

    std::cout << "----------------ELEMENT-WISE-ADDITION-----------------\n";
    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            for (j = 0; j < vSize; ++j)
                v[j] += u[j];
    BENCHMARK_END("standard vector addition", itCount)

    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            av += au;
    BENCHMARK_END("armadillo vector addition", itCount)

    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            bv += bu;
    BENCHMARK_END("blaze vector addition", itCount)

    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            for (j = 0; j < sse2VSize; ++j)
                vclv[j] += vclu[j];

        if (reminder) {
            for (i = 0; i < itCount; ++i)
                for (j = 0; j < reminder; ++j)
                    vclrv[j] += vclru[j];
        }
    BENCHMARK_END("VCL(SSE2 - 4floats) vector addition", itCount)


    std::cout << "-------------ELEMENT-WISE-MULTIPLICATION--------------\n";
    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            for (j = 0; j < vSize; ++j)
                v[j] *= u[j];
    BENCHMARK_END("standard vector element wise multiplication", itCount)

    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            av %= au;
    BENCHMARK_END("armadillo vector element wise multiplication", itCount)

    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            bv *= bu;
    BENCHMARK_END("blaze vector element wise multiplication", itCount)

    BENCHMARK_START
        for (i = 0; i < itCount; ++i)
            for (j = 0; j < sse2VSize; ++j)
                vclv[j] *= vclu[j];

        if (reminder) {
            for (i = 0; i < itCount; ++i)
                for (j = 0; j < reminder; ++j)
                    vclrv[j] *= vclru[j];
        }
    BENCHMARK_END("VCL(SSE2 - 4floats) element wise multiplication", itCount)


    std::cout << "------------SCALAR / DOT / INNER PRODUCT-------------\n";
    BENCHMARK_START
        sum = 0;
        for (i = 0; i < itCount; ++i)
            for (j = 0; j < vSize; ++j)
                sum += (v[j] * u[j]);

        /* force the compiler to not remove 
         * the calculation by using the 'sum' 
         * variable */
        std::cout << sum << "\n";
    BENCHMARK_END("standard vector scalar / dot / inner product", itCount)

    BENCHMARK_START
        auto aut = au.t(); /* transposed */
        sum = 0;
        for (i = 0; i < itCount; ++i)
            sum += as_scalar(av * aut);
        std::cout << sum << "\n";
    BENCHMARK_END("armadillo vector scalar / dot / inner product", itCount)

    BENCHMARK_START
        blaze::StaticVector<float, vSize, blaze::rowVector> but = blaze::trans(bu); /* transposed */
        sum = 0;
        for (i = 0; i < itCount; ++i)
            sum += blaze::dotu(bv, but);
        std::cout << sum << "\n";
    BENCHMARK_END("blaze vector scalar / dot / inner product", itCount)

    BENCHMARK_START
        sum = 0;
        for (i = 0; i < itCount; ++i)
            for (j = 0; j < sse2VSize; ++j)
                sum += horizontal_add((vclv[j] * vclu[j]));

        if (reminder) {
            for (i = 0; i < itCount; ++i)
                for (j = 0; j < reminder; ++j)
                    sum += (vclrv[j] * vclru[j]);
        }
        std::cout << sum << "\n";
    BENCHMARK_END("VCL(SSE2 - 4floats) scalar / dot / inner product", itCount)

    return 0;
}

基准编译：

g++ -DARMA_DONT_USE_WRAPPER -DARMA_USE_BLAS -DNDEBUG -DARMA_NO_DEBUG -march=native -O3 -ffast-math -std=c++14 -o vec-bench vec-bench.cpp -lopenblas -fprofile-use

编辑

Oprofile数据：

逐元素加法

armadillo: 99.6462% -> void arma::arrayops::inplace_plus_base<float>(float*, float const*, unsigned long long)
blaze: 99.7094% -> main (meaning all blaze functions are inlined and it never go into openblas)

逐元素乘法

armadillo: 99.6596% -> void arma::arrayops::inplace_mul_base<float>(float*, float const*, unsigned long long)
blaze: 99.6650% -> main (same as above)

标量积

armadillo: 84.3580% -> /usr/lib/libopenblasp-r0.2.18.so
blaze: 84.8001% -> /usr/lib/libopenblasp-r0.2.18.so

所以 Armadillo 和 Blaze 只在点积运算中使用 BLAS。

关于向量/矩阵/线性代数库的性能（c ++）

0 个答案: