Question

我有以下测试函数：

std::pair<double, double> test_speed_vmv(const size_t size)
{
    const size_t rounds = 10000;
    arma::colvec a = arma::colvec(size, arma::fill::randn);
    arma::rowvec b = arma::rowvec(size, arma::fill::randn);
    arma::colvec b1 = arma::colvec(size);
    arma::colvec c = arma::colvec(size, arma::fill::zeros);
    arma::colvec d = arma::colvec(size, arma::fill::zeros);
    arma::colvec e = arma::colvec(size, arma::fill::zeros);
    arma::mat A = arma::mat(size, size, arma::fill::ones);
    for(size_t i = 0; i < size; ++i)
        b1[i] = b[i];
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    for(size_t j = 0; j < rounds; ++j)
        c = A * a;
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    std::chrono::high_resolution_clock::time_point t3 = std::chrono::high_resolution_clock::now();
    for(size_t j = 0; j < rounds; ++j)
    {
        e = A * a;
    }
    std::chrono::high_resolution_clock::time_point t4 = std::chrono::high_resolution_clock::now();
    for(size_t i = 0; i < size; ++i)
        std::cout << c[i] << '\t' << e[i] << '\n';
    auto duration_avx = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count()/rounds;
    auto duration_arma = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count()/rounds;
    if(arma::approx_equal(c, e, "absdiff", 1e-3) == 1)
        return std::make_pair<double, double>(duration_avx, duration_arma);
    else
        return std::make_pair<double, double>(-1, -1);
}

从main调用

#include <iostream>
#include <immintrin.h>
#include <omp.h>
#include <armadillo>
#include <chrono>
#include <mkl.h>
/**
 * @brief Runs test_speed_vmv for a list of problem sizes and prints one
 *        summary line per size.
 *
 * All measurements are taken first; the summary is printed afterwards so it
 * is not interleaved with the per-element output of test_speed_vmv.
 *
 * @return Always 0.
 */
int main(void)
{
    // Sizes are stored as size_t, the type test_speed_vmv expects, so no
    // floating-point to integer conversion happens at the call site.
    const std::vector<size_t> matrix_sizes = {2, 5, 10, 80, 95, 96, 99, 100, 128, 256, 512, 1000/*, 1024, 2048, 2500*/};
    std::vector<std::pair<double, double>> matrix_results;
    matrix_results.reserve(matrix_sizes.size());

    for(const size_t size : matrix_sizes)
        matrix_results.push_back(test_speed_vmv(size));

    for(size_t i = 0; i < matrix_sizes.size(); ++i)
    {
        const auto& [first_time, second_time] = matrix_results[i];
        // Guard the ratio against a zero denominator.
        const double divisor = (first_time == 0) ? 1 : first_time;
        std::cout << "Multiplication of a matrix with a vector with a length of " << matrix_sizes[i] << " took " << first_time << " for a single line and " << second_time << " for two lines, resulting in a single line being "
                  << second_time / divisor << " times faster\n";
    }
    std::cout << "|-----------------------------------------|\n";

    return 0;
}

现在我期望对每个向量长度和矩阵大小都得到相同的时间和相同的值。但实际上我得到的是：

Multiplication of a matrix with a vector with a length of 2 took 0 for a single line and 0 for two lines, resulting in a single line being 0 times faster
Multiplication of a matrix with a vector with a length of 5 took 39 for a single line and 0 for two lines, resulting in a single line being 0 times faster
Multiplication of a matrix with a vector with a length of 10 took 0 for a single line and 0 for two lines, resulting in a single line being 0 times faster
Multiplication of a matrix with a vector with a length of 80 took 2250 for a single line and 1 for two lines, resulting in a single line being 0.000444444 times faster
Multiplication of a matrix with a vector with a length of 95 took 1 for a single line and 1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 96 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 99 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 100 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 128 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 256 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 512 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster
Multiplication of a matrix with a vector with a length of 1000 took -1 for a single line and -1 for two lines, resulting in a single line being 1 times faster

几次运行的时间是一致的。但是，为什么我对长于96个元素的向量得到不同的结果呢？

使用以下命令完成编译：

g++ -I -O2 -ftree-vectorize -mavx2 -funroll-loops -g -march=native -std=gnu++17 -fopenmp -c avx2_test.cpp -o avx2_test.o
g++ -lm -larmadillo -lgomp -lpthread -lX11 -L/opt/boost/lib -lboost_system -L/opt/intel/mkl/lib/intel64 -lmkl_rt avx2_test.o -o avx2

使用armadillo向量运行相同的计算两次会得到不同的结果

0 个答案: