我有以下测试函数:
std::pair<double, double> test_speed_vmv(const size_t size)
{
const size_t rounds = 10000;
arma::colvec a = arma::colvec(size, arma::fill::randn);
arma::rowvec b = arma::rowvec(size, arma::fill::randn);
arma::colvec b1 = arma::colvec(size);
arma::colvec c = arma::colvec(size, arma::fill::zeros);
arma::colvec d = arma::colvec(size, arma::fill::zeros);
arma::colvec e = arma::colvec(size, arma::fill::zeros);
arma::mat A = arma::mat(size, size, arma::fill::ones);
for(size_t i = 0; i < size; ++i)
b1[i] = b[i];
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
for(size_t j = 0; j < rounds; ++j)
c = A * a;
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
std::chrono::high_resolution_clock::time_point t3 = std::chrono::high_resolution_clock::now();
for(size_t j = 0; j < rounds; ++j)
{
e = A * a;
}
std::chrono::high_resolution_clock::time_point t4 = std::chrono::high_resolution_clock::now();
for(size_t i = 0; i < size; ++i)
std::cout << c[i] << '\t' << e[i] << '\n';
auto duration_avx = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count()/rounds;
auto duration_arma = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count()/rounds;
if(arma::approx_equal(c, e, "absdiff", 1e-3) == 1)
return std::make_pair<double, double>(duration_avx, duration_arma);
else
return std::make_pair<double, double>(-1, -1);
}
该函数在 main 中按如下方式调用:
#include <iostream>
#include <immintrin.h>
#include <omp.h>
#include <armadillo>
#include <chrono>
#include <mkl.h>
/// @brief Runs test_speed_vmv for a fixed list of problem sizes and prints one
///        summary line per size.
///
/// All benchmarks are executed first (each one prints its own result vectors),
/// and only afterwards the summary lines are written, so the summary stays
/// together at the end of the output.
///
/// @return Always 0.
int main()
{
    // Sizes are element counts, so they are stored as size_t; this is the type
    // test_speed_vmv takes and avoids a floating-point to integer conversion
    // on every call.
    const std::vector<size_t> matrix_sizes = {2, 5, 10, 80, 95, 96, 99, 100, 128, 256, 512, 1000/*, 1024, 2048, 2500*/};

    std::vector<std::pair<double, double>> matrix_results;
    matrix_results.reserve(matrix_sizes.size());
    for (const size_t length : matrix_sizes)
        matrix_results.push_back(test_speed_vmv(length));

    for (size_t i = 0; i < matrix_sizes.size(); ++i)
    {
        const std::pair<double, double>& timing = matrix_results[i];
        // Guard the ratio against a measured time of exactly zero.
        const double divisor = (timing.first == 0) ? 1 : timing.first;
        std::cout << "Multiplication of a matrix with a vector with a length of " << matrix_sizes[i]
                  << " took " << timing.first << " for a single line and " << timing.second
                  << " for two lines, resulting in a single line being "
                  << timing.second / divisor << " times faster\n";
    }
    std::cout << "|-----------------------------------------|\n";
    return 0;
}
我原本期望,对于每一种向量长度和矩阵大小,两次计时都得到相同的耗时和相同的结果值。但实际得到的输出却是:
Multiplication of a matrix with a vector with a length of 2 took 0 for a single line and 0 for two lines, resulting in a single line being 0 times faster
Multiplication of a matrix with a vector with a length of 5 took 39 for a single line and 0 for two lines, resulting in a single line being 0 times faster
Multiplication of a matrix with a vector with a length of 10 took 0 for a single line and 0 for two lines, resulting in a single line being 0 times faster
Multiplication of a matrix with a vector with a length of 80 took 2250 for a single line and 1 for two lines, resulting in a single line being 0.000444444 times faster
Multiplication of a matrix with a vector with a length of 95 took 1 for a single line and 1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 96 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 99 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 100 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 128 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 256 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 512 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 1000 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
几次运行的时间是一致的。但是,为什么我对长于96个元素的向量得到不同的结果呢?
编译使用以下命令完成:
g++ -I -O2 -ftree-vectorize -mavx2 -funroll-loops -g -march=native -std=gnu++17 -fopenmp -c avx2_test.cpp -o avx2_test.o
g++ -lm -larmadillo -lgomp -lpthread -lX11 -L/opt/boost/lib -lboost_system -L/opt/intel/mkl/lib/intel64 -lmkl_rt avx2_test.o -o avx2