Question

我用以下代码（借鉴自一篇旧帖）对 Blitz++、Armadillo 和 boost::MultiArray 做了比较：

#include <iostream>
using namespace std;
#include <windows.h>
#define _SCL_SECURE_NO_WARNINGS
#define BOOST_DISABLE_ASSERTS 
#include <boost/multi_array.hpp>
#include <blitz/array.h>
#include <armadillo>

// Micro-benchmark that times two workloads on four array representations:
// boost::multi_array, blitz::Array, arma::mat and a raw new[] buffer.
//   * "Loop":        every element is assigned the constant 1.0001.
//   * "computation": every element is updated in place as a += a * 0.5.
// Each workload is repeated ITERATIONS times over X_SIZE * Y_SIZE doubles, and
// the difference of two ::GetTickCount() readings, divided by 1000.0, is printed
// as seconds. argc and argv are not used. Always returns 0.
//
// NOTE(review): GetTickCount is documented by Microsoft as having a resolution of
// roughly 10-16 ms, so intervals of a few tens of milliseconds are only coarsely
// resolved by this timer — confirm against the platform documentation.
// NOTE(review): printf is called here but <cstdio> is not among the includes
// visible above this function; presumably it arrives transitively — confirm.
// NOTE(review): none of the arrays is read after its timed loop, so an optimizing
// compiler may be free to drop some of the stores — verify in the generated code.
int main(int argc, char* argv[])
{
    // Problem size: a 1000 x 1000 array of doubles, swept 100 times per measurement.
    const int X_SIZE = 1000;
    const int Y_SIZE = 1000;
    const int ITERATIONS = 100;
    // Tick counts taken right before and right after each timed region; both
    // variables are reused by every section below.
    unsigned int startTime = 0;
    unsigned int endTime = 0;

    // Every measurement below lives in its own brace scope, so its array goes out
    // of scope (or is delete[]d) before the next section starts.


    //------------------Measure boost Loop------------------------------------------
    {
        typedef boost::multi_array<double, 2> ImageArrayType;
        ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // x is the outer index and y the inner one, using chained operator[].
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    boostMatrix[x][y] = 1.0001;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[Boost Loop] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }
    //------------------Measure blitz Loop-------------------------------------------
    {
        blitz::Array<double, 2> blitzArray( X_SIZE, Y_SIZE );
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Same x-outer / y-inner order as the boost section, via operator().
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    blitzArray(x,y) = 1.0001;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[Blitz Loop] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure armadillo loop----------------------------------------
    {
        arma::mat matArray( X_SIZE, Y_SIZE );
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Loop order is swapped relative to the boost/blitz sections: y is the
            // outer index and x the inner one.
            // NOTE(review): presumably chosen to follow Armadillo's column-major
            // storage — confirm against the Armadillo documentation.
            for (int y = 0; y < Y_SIZE; ++y)
            {
                for (int x = 0; x < X_SIZE; ++x)
                {
                    matArray(x,y) = 1.0001;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[arma  Loop]  Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure native loop----------------------------------------
    // Baseline: a flat heap buffer of X_SIZE * Y_SIZE doubles written with a single
    // linear index.
    // NOTE(review): unlike the native computation section further down, this buffer
    // is written for the first time inside the timed region, so any first-touch
    // cost of the fresh allocation may be included in the result — confirm.
    {
        double *nativeMatrix = new double [X_SIZE * Y_SIZE];
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
            {
                nativeMatrix[y] = 1.0001;
            }
        }
        endTime = ::GetTickCount();
        printf("[Native Loop]Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
        delete[] nativeMatrix;
    }

    //------------------Measure boost computation-----------------------------------
    {
        typedef boost::multi_array<double, 2> ImageArrayType;
        ImageArrayType boostMatrix(boost::extents[X_SIZE][Y_SIZE]);
        // Untimed initialisation: every element starts at 1.0001.
        for (int x = 0; x < X_SIZE; ++x)
        {
            for (int y = 0; y < Y_SIZE; ++y)
            {
                boostMatrix[x][y] = 1.0001;
            }
        }
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Timed region: element-wise a += a * 0.5 with explicit loops.
            for (int x = 0; x < X_SIZE; ++x)
            {
                for (int y = 0; y < Y_SIZE; ++y)
                {
                    boostMatrix[x][y] += boostMatrix[x][y] * 0.5;
                }
            }
        }
        endTime = ::GetTickCount();
        printf("[Boost computation] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure blitz computation-----------------------------------
    {
        blitz::Array<double, 2> blitzArray( X_SIZE, Y_SIZE );
        // Untimed initialisation through whole-array assignment.
        blitzArray = 1.0001;
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Timed region: the update is written as one whole-array expression;
            // no explicit element loops appear here.
            blitzArray += blitzArray*0.5;
        }
        endTime = ::GetTickCount();
        printf("[Blitz computation] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure armadillo computation-------------------------------
    {
        arma::mat matArray( X_SIZE, Y_SIZE );
        // Untimed initialisation.
        matArray.fill(1.0001);
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Disabled alternative that would re-fill the matrix on every iteration:
            //matArray.fill(1.0001);
            // Timed region: whole-matrix expression, as in the blitz section.
            matArray += matArray*0.5;
        }
        endTime = ::GetTickCount();
        printf("[arma  computation] Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
    }

    //------------------Measure native computation------------------------------------------
    // Baseline for the computation workload on a flat heap buffer.
    {
        double *nativeMatrix = new double [X_SIZE * Y_SIZE];
        // Untimed initialisation: the buffer is fully written before timing starts.
        for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
        {
            nativeMatrix[y] = 1.0001;
        }
        startTime = ::GetTickCount();
        for (int i = 0; i < ITERATIONS; ++i)
        {
            // Timed region: element-wise a += a * 0.5 over the linear index range.
            for (int y = 0; y < Y_SIZE*X_SIZE; ++y)
            {
                nativeMatrix[y] += nativeMatrix[y] * 0.5;
            }
        }
        endTime = ::GetTickCount();
        printf("[Native computation]Elapsed time: %6.3f seconds\n", (endTime - startTime) / 1000.0);
        delete[] nativeMatrix;
    }

    return 0;
}

在Windows上，VS2010，结果是

[Boost Loop] Elapsed time:  1.217 seconds
[Blitz Loop] Elapsed time:  0.046 seconds
[arma  Loop]  Elapsed time:  0.078 seconds
[Native Loop]Elapsed time:  0.172 seconds
[Boost computation] Elapsed time:  2.152 seconds
[Blitz computation] Elapsed time:  0.156 seconds
[arma  computation] Elapsed time:  0.078 seconds
[Native computation]Elapsed time:  0.078 seconds

在Windows上，intel c ++，结果是

[Boost Loop] Elapsed time:  0.468 seconds
[Blitz Loop] Elapsed time:  0.125 seconds
[arma  Loop]  Elapsed time:  0.046 seconds
[Native Loop]Elapsed time:  0.047 seconds
[Boost computation] Elapsed time:  0.796 seconds
[Blitz computation] Elapsed time:  0.109 seconds
[arma  computation] Elapsed time:  0.078 seconds
[Native computation]Elapsed time:  0.062 seconds

奇怪的是：

(1) with VS2010, the native computation (which includes the loop) is faster than the plain native loop
(2) the blitz loop behaves very differently under VS2010 and Intel C++.

要使用 Intel C++ 编译器编译 Blitz++，blitz/intel/ 文件夹中需要有一个名为 bzconfig.h 的文件，但实际上该文件夹里并没有这个文件。我只是把 blitz/ms/bzconfig.h 复制了过去，这可能导致配置不是最优的。谁能告诉我如何用 Intel C++ 编译器编译 Blitz++？手册中说要运行 bzconfig 脚本来获得正确的 bzconfig.h，但我不明白这是什么意思。

非常感谢！

添加我的一些结论：

1. Boost multi array is the slowest.
2. With intel c++ compiler, native pointers are very fast.
3. With intel c++ compiler,  armadillo can achieve the performance of native pointers.
4. Also test eigen, it is x0% slower than armadillo in my simple cases.
5. Curious about blitz++'s behavior in intel c++ compiler with proper configuration.
   Please see my question.

Answer 1

简短回答：./configure CXX=icpc，通过阅读Blitz ++用户指南找到。

答案很长：

要使用intel c ++编译器编译blitz ++，blitz / intel /文件夹中需要一个名为bzconfig.h的文件。但没有。

两点都没错。Blitz++ 本应自行生成该文件。根据 blitz-0.10.tar.gz 中附带的 Blitz++ 用户指南 blitz.pdf 的“安装”一节：

Blitz ++使用GNU Autoconf，它处理各种平台和编译器的Makefile重写。

更准确地说，Blitz++ 使用 GNU autotools 工具链（automake、autoconf、configure），它可以生成 makefile、配置脚本、头文件等等。这些 bzconfig.h 文件应由 configure 脚本生成，该脚本随 Blitz++ 一起提供，可以直接使用。

我只是复制了blitz / ms / bzconfig.h中的那个。这可能会给出一个非最佳配置。

如果“非最佳”对您来说意味着“无法工作”，那么是的。 :-) 您需要一个能够准确反映您所用编译器的 intel/bzconfig.h。

任何人都可以告诉我如何使用intel c ++编译器编译blitz ++？

阅读并遵循精细手册，特别是上面提到的“安装”部分。

进入 'blitz-VERSION' 目录，然后输入： ./configure CXX=[compiler] 其中 [compiler] 是 xlc++、icpc、pathCC、xlC、cxx、aCC、CC、g++、KCC、pgCC 或 FCC 之一。（如果不指定 C++ 编译器，configure 脚本将尝试为当前平台查找合适的编译器。）

您这样做了吗？对于英特尔编译器，您需要使用 ./configure CXX=icpc。

在手册中，它说运行bzconfig脚本来获取正确的bzconfig.h。但我不明白这意味着什么。

我认为“它”的意思是“那个”。您所说的“手册”指的是什么？我手头的 Blitz++ 用户指南并没有提到 bzconfig。您确定使用的是与您的 Blitz++ 版本相对应的手册吗？

PS：在 blitz-0.10 的内容中搜索“bzconfig”，看起来“bzconfig”已不再是 Blitz++ 的一部分，但过去曾经是：

find . -name bzconfig -> 没有结果

find . -print0 | xargs -0 grep -a -i -n -e bzconfig：

./blitz/compiler.h:44:    #error  In <blitz/config.h>: A working template implementation is required by Blitz++ (you may need to rerun the compiler/bzconfig script)

需要更新。

./blitz/gnu/bzconfig.h:4:/* blitz/gnu/bzconfig.h. Generated automatically at end of configure. */
./configure.ac:159:# autoconf replacement of bzconfig

由此可见，这些 bzconfig.h 文件应由 configure 生成。

./ChangeLog.1:1787: will now replace the old file that was generate with the bzconfig

这可能就是切换到 autoconf 的那次改动。

./INSTALL:107: 2. Go into the compiler subdirectory and run the bzconfig

需要更新。是不是这一条让您去找 bzconfig 的？

./README:27:compiler Compiler tests (used with obsolete bzconfig script)

需要更新，发行包中已不再包含 compiler 目录。

Answer 2

据我所知，您通过测量单个矩阵乘以标量的速度来判断每个矩阵库的性能。由于其基于模板的策略，Armadillo将通过将每个乘法分解为大多数编译器的可并行化代码来做得很好。

但我建议您需要重新考虑您的测试范围和方法。例如，您遗漏了每个BLAS实现。您需要的BLAS功能是dscal。供应商为您的特定CPU提供的实现可能会做得很好。

更相关的是，任何合理的矢量库都需要做的事情还有很多：矩阵乘法，点积，矢量长度，转置等，这些都是你的测试无法解决的。您的测试完全解决了两件事：元素赋值，实际上从来不是矢量库的瓶颈，以及标量/向量乘法，这是每个CPU制造商提供的BLAS 1级函数。

关于 BLAS 1 级函数与编译器生成代码的比较，可参见此处（here）的讨论。

TL;DR：use Armadillo with BLAS and LAPACK native libraries linked in for your platform

Answer 3

我的测试表明，boost数组具有与本机/硬编码C ++代码相同的性能。

您需要使用激活的编译器优化来比较它们。那是： -O3 -DNDEBUG -DBOOST_UBLAS_NDEBUG -DBOOST_DISABLE_ASSERTS -DARMA_NO_DEBUG ... 当我测试（em ++）时，Boost在停用其断言时执行速度至少快10倍，使用-O3启用3级优化等。任何公平比较都应使用这些标志。

比较blitz ++，armadillo，boost :: MultiArray

3 个答案: