为什么这个AVX代码这么慢?

时间:2017-09-29 14:02:51

标签: c++ performance sse simd avx

代码如下,问题是:为什么AVX版本比朴素(naive)版本更慢?

// Per-lane comparison thresholds. x_2() reads this array with _mm256_load_pd,
// which requires a 32-byte aligned address, so the array must be aligned to
// 32 bytes (16 is not enough for a 256-bit aligned load).
// The last threshold (-100) keeps the top mask bit clear for the lane values
// used in this file (v.m[3] is 3 and is never modified), so the mask stays
// within the 8 entries of `an`.
alignas(32) const double mx[4] = { 1., 1., 1., -100. };

// Lookup table indexed by the 4-bit comparison mask produced in x_2().
// Only scalar element reads are performed on it.
alignas(16) const double an[8] = { 8., 7., 6., 5., 4., 3., 2., 1. };

// Scalar ("naive") reference: selects one of a, b, c according to the three
// comparisons (a < e), (b < a), (c < b) and returns the selected value plus e.
//
// Selection table (rows: a < e, columns: b < a / c < b):
//
//   a < e | b < a | c < b | selected
//   ------+-------+-------+---------
//   true  | true  | true  |   a
//   true  | true  | false |   b
//   true  | false | true  |   c
//   true  | false | false |   a
//   false | true  | true  |   b
//   false | true  | false |   c
//   false | false | true  |   a
//   false | false | false |   b
__forceinline double x_1(const double a, const double b, const double c, const double e)
{
    // The comparisons have no side effects, so evaluating all three up front
    // yields the same decisions as the nested branches would.
    const bool a_below_e = a < e;
    const bool b_below_a = b < a;
    const bool c_below_b = c < b;

    double selected;
    if (a_below_e)
    {
        selected = b_below_a ? (c_below_b ? a : b)
                             : (c_below_b ? c : a);
    }
    else
    {
        selected = b_below_a ? (c_below_b ? b : c)
                             : (c_below_b ? a : b);
    }

    return selected + e;
}

// Overlays four doubles (m) with one 256-bit AVX vector (x) so the same
// storage can be written element-wise and read as a whole vector.
// NOTE(review): reading x after writing m relies on union type punning, which
// standard C++ does not guarantee; presumably accepted by the target compiler
// (MSVC) - confirm.
union un {
    // Scalar view of the lanes; default-initialised to 0, 1, 2, 3.
    // NOTE(review): align(16) is weaker than the 32 bytes a __m256d normally
    // needs; the union's effective alignment presumably comes from x - confirm.
    double __declspec(align(16)) m[4] = { 0., 1., 2., 3. };
    // Vector view of the same four doubles.
    __m256d x;
};

// Global operand: read as a vector by x_2() and permuted element-wise by the
// second benchmark loop in main().
un v;

// AVX variant: compares the four lanes of the global `v` against the
// thresholds in `mx` (ordered less-than), packs the per-lane results into a
// bit mask and uses that mask as an index into the lookup table `an`.
__forceinline double x_2()
{
    // NOTE(review): vzeroupper is normally issued after 256-bit code, before
    // returning to legacy SSE code; at function entry it probably only adds
    // cost - confirm before relying on it.
    _mm256_zeroupper();

    const __m256d thresholds = _mm256_load_pd(mx);
    const __m256d is_below   = _mm256_cmp_pd(v.x, thresholds, _CMP_LT_OS);

    // One bit per lane: bit i is set when lane i of v is below mx[i].
    const int lane_mask = _mm256_movemask_pd(is_below);
    return an[lane_mask];
}

// Times 2^30 iterations of the scalar x_1() and of the AVX x_2(), rotating the
// three inputs on every iteration, and prints the elapsed milliseconds together
// with the last result of each variant.
int main()
{
    // Benchmark 1: scalar version operating on three local doubles.
    {
        double a = 0.;
        double b = 1.;
        double c = 2.;

        double r = 0.;

        const auto t1 = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < (1 << 30); ++i)
        {
            r = x_1(a, b, c, 1.);
            // Rotate (a, b, c) -> (b, c, a) so the inputs change every iteration.
            std::swap(a, b);
            std::swap(b, c);
        }
        const auto t2 = std::chrono::high_resolution_clock::now();
        std::cout << "NAIVE " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << " : " << r << "\n";
    }

    // Benchmark 2: AVX version operating on the global union `v`.
    {
        double r = 0.;
        const auto t1 = std::chrono::high_resolution_clock::now();
        // Fixed: the ';' between the loop condition and the increment was missing.
        for (int i = 0; i < (1 << 30); ++i)
        {
            r = x_2();
            // Same rotation as above, applied to the first three lanes of v.
            // NOTE(review): these scalar writes are immediately followed by a
            // 256-bit read of the same storage in x_2(); that pattern probably
            // prevents store-to-load forwarding and may dominate the measured
            // time - confirm with a profiler.
            std::swap(v.m[0], v.m[1]);
            std::swap(v.m[1], v.m[2]);
        }
        const auto t2 = std::chrono::high_resolution_clock::now();
        std::cout << "AVX : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << " : " << r;
    }
    return 0;
}

我在MSVC上使用完全优化(Full Optimization)和/arch:AVX编译这段代码,但即使只调用_mm256_zeroupper,也比x_1函数慢……为什么?在什么情况下使用“比较内联函数”(compare intrinsics)才会比原生代码更快?

我的测量结果:x_1() 耗时 2.5 秒,x_2() 耗时 11.7 秒。

0 个答案:

没有答案