Question

我正在对大型矩阵（100x100 到 3000x3000）进行一些计算（大量求和，以及 120 次矩阵-向量乘法），并使用 Eigen 库来处理向量和矩阵。

我想知道如何加速我的程序。我应该继续使用 Eigen，改用一维数组，改用 std::vector，还是使用其他库？

Answer 1

假设您不想迁移到 GPU，并且相信 Eigen 的 benchmark 页面，那么 Eigen 已经非常快了。您特别提到了矩阵-向量乘积（matrix-vector products），在您给出的矩阵规模范围内，Eigen 名列前茅。请确保启用 OpenMP，因为 Eigen 会利用多核（multiple cores）；同样也要确保启用向量化（vectorization）。

Answer 2

我做了一次对比测试，比较了 Eigen 和 ViennaCL（两者均为 Debug 构建）：

Processing Unit | Mat Size | Exec time |    Calc        | Sparse %
CPU                   10       0,000     dense_mat*mat
GPU                   10       0,010     dense_mat*mat
CPU                  100       0,103     dense_mat*mat
GPU                  100       0,001     dense_mat*mat
CPU                 1000      97,232     dense_mat*mat
GPU                 1000       0,072     dense_mat*mat
CPU                   10       0,000     sparse_mat*mat   0,25
GPU                   10       0,007     sparse_mat*mat   0,25
CPU                   10       0,000     sparse_mat*mat   0,5
GPU                   10       0,000     sparse_mat*mat   0,5
CPU                   10       0,000     sparse_mat*mat   1
GPU                   10       0,000     sparse_mat*mat   1
CPU                  100       0,010     sparse_mat*mat   0,25
GPU                  100       0,001     sparse_mat*mat   0,25
CPU                  100       0,030     sparse_mat*mat   0,5
GPU                  100       0,001     sparse_mat*mat   0,5
CPU                  100       0,106     sparse_mat*mat   1
GPU                  100       0,001     sparse_mat*mat   1
CPU                 1000       7,131     sparse_mat*mat   0,25
GPU                 1000       0,073     sparse_mat*mat   0,25
CPU                 1000      26,628     sparse_mat*mat   0,5
GPU                 1000       0,072     sparse_mat*mat   0,5
CPU                 1000     101,389     sparse_mat*mat   1
GPU                 1000       0,072     sparse_mat*mat   1

使用的代码是：

//disable debug mechanisms to have a fair comparison with ublas:
#ifndef NDEBUG
#define NDEBUG
#endif


//
// include necessary system headers
//
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

//
// ublas includes
//
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/io.hpp>

#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
//
// Eigen includes
//
#include <Eigen/Core>
#include <Eigen/Dense>

// Must be set if you want to use ViennaCL algorithms on ublas objects
#define VIENNACL_WITH_UBLAS 1

//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms on Eigen objects
//
#define VIENNACL_WITH_EIGEN 1
//
// IMPORTANT: Must be set prior to any ViennaCL includes if you want to use ViennaCL algorithms using OPENCL
//
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif

//
// ViennaCL includes
//
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/bicgstab.hpp"

// Some helper functions for this tutorial:
#include "examples/tutorial/Random.hpp"
#include "examples/tutorial/vector-io.hpp"

#include "examples/benchmarks/benchmark-utils.hpp"

#define BLAS3_MATRIX_SIZE   700

using namespace boost::numeric;


#ifndef VIENNACL_WITH_OPENCL
// Placeholder used in place of the OpenCL device list when the program is
// compiled without VIENNACL_WITH_OPENCL. It always reports a size of one.
struct dummy
{
    std::size_t size() const
    {
        const std::size_t device_count = 1;
        return device_count;
    }
};
#endif


int main()
{
    typedef float     ScalarType;

    Timer timer;
    double exec_time;

    //
    // Initialize OpenCL device in the context
    //
    std::cout << std::endl << "--- initialized OpenGL device using ViennaCL ---" << std::endl;
#ifdef VIENNACL_WITH_OPENCL
    std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
#else
    dummy devices;
#endif

#ifdef VIENNACL_WITH_OPENCL
    viennacl::ocl::current_context().switch_device(devices[0]);
    std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
#endif

    //// Output results file
    std::ofstream resultsFile;
    resultsFile.open("resultsFile.txt");
    resultsFile << "Processing Unit,Mat Size,Exec time" << std::endl;
    std::cout << "Processing Unit,Mat Size,Exec time" << std::endl;


    // Start defining the dense matrices
    size_t points = 230000;
    size_t transRows=4;
    size_t transCols=3;
    // Other alternative: Use Eigen
    Eigen::MatrixXf eigen_A(points, transRows);
    Eigen::MatrixXf eigen_B(transRows, transCols);
    // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    std::vector<ScalarType> stl_A(points * transRows);
    std::vector<ScalarType> stl_B(transRows * transCols);
    // Set up the ViennaCL object
    viennacl::matrix<ScalarType> vcl_A(points, transRows);
    viennacl::matrix<ScalarType> vcl_B(transRows, transCols);
    // Fill dense matrix in normal memory
    for (unsigned int i = 0; i < points; ++i)
    {
        for (unsigned int j = 0; j < transRows; ++j)
        {
            stl_A[i*transRows + j] = random<ScalarType>();
            eigen_A(i,j) = stl_A[i*transRows + j];
        }
    }
    for (unsigned int i = 0; i < transRows; ++i)
    {
        for (unsigned int j = 0; j < transCols; ++j)
        {
            stl_B[i*transCols + j] = random<ScalarType>();
            eigen_B(i,j) = stl_A[i*transCols + j];
        }
    }


    // Perform the matrix*matrix product
    // On CPU
    Eigen::MatrixXf eigen_C(points, transCols);
    timer.start();
    eigen_C = eigen_A * eigen_B;
    exec_time = timer.get();
    resultsFile << "CPU," << points << "," << exec_time << std::endl;
    std::cout << "CPU," << points << "," << exec_time << std::endl;

    // on GPU
    timer.start();
    // Copy to gpu memory
    // Using fastcopy I can get ~500x speed improvement
    viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);
    viennacl::fast_copy(&(stl_B[0]),&(stl_B[0]) + stl_B.size(),vcl_B);

    viennacl::matrix<ScalarType> vcl_C(points, transCols);
    vcl_C = viennacl::linalg::prod(vcl_A, vcl_B);
    viennacl::backend::finish();

    exec_time = timer.get();
    resultsFile << "GPU," << points << "," << exec_time << std::endl;
    std::cout << "GPU," << points << "," << exec_time << std::endl;

    //// Start defining the dense matrices
    //for(size_t denseSize=10; denseSize<=1000; denseSize=denseSize*10)
    //{
    //  // Other alternative: Use Eigen
    //  Eigen::MatrixXf eigen_A(denseSize, denseSize);
    //  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //  std::vector<ScalarType> stl_A(denseSize * denseSize);
    //  // Set up the ViennaCL object
    //  viennacl::matrix<ScalarType> vcl_A(denseSize, denseSize);
    //  // Fill dense matrix in normal memory
    //  for (unsigned int i = 0; i < denseSize; ++i)
    //  {
    //      for (unsigned int j = 0; j < denseSize; ++j)
    //      {
    //          stl_A[i*denseSize + j] = random<ScalarType>();
    //          eigen_A(i,j) = stl_A[i*denseSize + j];
    //      }
    //  }


    //  // Perform the matrix*matrix product
    //  // On CPU
    //  Eigen::MatrixXf eigen_C(denseSize, denseSize);
    //  timer.start();
    //  eigen_C = eigen_A * eigen_A;
    //  exec_time = timer.get();
    //  resultsFile << "CPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "CPU," << denseSize << "," << exec_time << std::endl;

    //  // on GPU
    //  timer.start();
    //  // Copy to gpu memory
    //  // Using fastcopy I can get ~500x speed improvement
    //  viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //  viennacl::matrix<ScalarType> vcl_C(denseSize, denseSize);
    //  vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //  viennacl::backend::finish();

    //  exec_time = timer.get();
    //  resultsFile << "GPU," << denseSize << "," << exec_time << std::endl;
    //  std::cout << "GPU," << denseSize << "," << exec_time << std::endl;
    //}

    //// Start defining the sparse matrices
    //for(size_t sparseSize=10; sparseSize<=1000; sparseSize=sparseSize*10)
    //{
    //  for(float sparsePerc=0.25; sparsePerc<=1.0; sparsePerc=2*sparsePerc)
    //  {
    //      // Other alternative: Use Eigen
    //      Eigen::SparseMatrix<float> eigen_A(sparseSize, sparseSize);
    //      // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
    //      std::vector<ScalarType> stl_A(sparseSize * sparseSize);
    //      // Set up the ViennaCL sparse matrix
    //      viennacl::matrix<ScalarType> vcl_A(sparseSize, sparseSize);
    //      // Fill dense matrix in normal memory
    //      for (size_t i=0; i<sparseSize; ++i)
    //      {
    //          for (size_t j=0; j<sparseSize; ++j)
    //          {
    //              if (((rand()%100)/100.0) <= sparsePerc)
    //              {
    //                  stl_A[i*sparseSize + j] = float(rand());
    //                  eigen_A.insert(i,j) = stl_A[i*sparseSize + j];
    //              }
    //          }
    //      }


    //      // Perform the matrix*matrix product
    //      // On CPU
    //      Eigen::SparseMatrix<float> eigen_C(sparseSize, sparseSize);
    //      timer.start();
    //      eigen_C = (eigen_A * eigen_A).pruned();
    //      exec_time = timer.get();
    //      resultsFile << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "CPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;

    //      // on GPU
    //      timer.start();
    //      // Copy to gpu memory
    //      // Using fastcopy I can get ~500x speed improvement
    //      viennacl::fast_copy(&(stl_A[0]),&(stl_A[0]) + stl_A.size(),vcl_A);

    //      viennacl::matrix<ScalarType> vcl_C(sparseSize, sparseSize);
    //      vcl_C = viennacl::linalg::prod(vcl_A, vcl_A);
    //      viennacl::backend::finish();

    //      exec_time = timer.get();
    //      resultsFile << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //      std::cout << "GPU," << sparseSize << "," << sparsePerc << "," << exec_time << std::endl;
    //  }
    //}

    resultsFile.close();


    //
    //  That's it. 
    //

    std::cout << "Press [ENTER] to exit " << std::endl;
    std::cin.get();

    return EXIT_SUCCESS;
}

一维数组比 Eigen 动态向量更快吗？

2 个答案: