代码如下。问题是:为什么 AVX 版本比朴素(naive)版本更慢?
// Per-lane thresholds for the 4-lane "less than" comparison done in x_2().
// x_2() reads this array with _mm256_load_pd, an aligned 256-bit load that
// requires a 32-byte-aligned address; the previous 16-byte alignment did not
// guarantee that.
alignas(32) const double mx[4] = { 1., 1., 1., -100. };
// Result table indexed by the comparison bitmask computed in x_2().
// It is only read one element at a time; the 16-byte alignment is carried
// over from the original declaration.
alignas(16) const double an[8] = { 8., 7., 6., 5., 4., 3., 2., 1. };
// Scalar reference implementation of the benchmark kernel.
//
// Evaluates three ordered comparisons (a < e, b < a, c < b) and, depending on
// their outcome, picks one of a, b or c and returns it with e added.
//
//   a < e | b < a | c < b | result
//   ------+-------+-------+-------
//     T   |   T   |   T   | a + e
//     T   |   T   |   F   | b + e
//     T   |   F   |   T   | c + e
//     T   |   F   |   F   | a + e
//     F   |   T   |   T   | b + e
//     F   |   T   |   F   | c + e
//     F   |   F   |   T   | a + e
//     F   |   F   |   F   | b + e
__forceinline double x_1(const double a, const double b, const double c, const double e)
{
    if (a < e)
    {
        if (b < a)
        {
            return (c < b ? a : b) + e;
        }
        return (c < b ? c : a) + e;
    }

    if (b < a)
    {
        return (c < b ? b : c) + e;
    }
    return (c < b ? a : b) + e;
}
// Overlays an array of four doubles and one 256-bit AVX vector on the same
// storage, so the same data can be addressed per element (m) or as a whole
// vector (x).
// `m` carries the default member initializer, so a default-constructed `un`
// starts out holding { 0, 1, 2, 3 }.
// NOTE(review): writing through `m` and reading through `x` relies on union
// type punning being supported by the compiler — confirm for the toolchain in use.
union un {
double __declspec(align(16)) m[4] = { 0., 1., 2., 3. };
__m256d x;
};
// Global instance: x_2() reads it through `x`; main() permutes it through `m`.
un v;
// AVX variant of the benchmark kernel.
//
// Compares the four lanes of the global `v` against the thresholds in `mx`
// (ordered, signalling "less than") and uses the resulting per-lane bitmask as
// an index into the result table `an`.
//
// Returns an[k], where bit i of k (i = 0..2) is set when v.m[i] < mx[i].
__forceinline double x_2()
{
    // The original issued _mm256_zeroupper() here, before any 256-bit work.
    // At that point it cannot prevent an AVX->SSE transition penalty and only
    // adds cost to every call, so it has been dropped.
    const __m256d below = _mm256_cmp_pd(v.x, _mm256_load_pd(mx), _CMP_LT_OS);

    // movemask yields one bit per lane (0..15), but `an` has only 8 entries.
    // Lane 3 is padding, so its bit is masked off to keep the index in range
    // even if that lane ever compares true.
    return an[_mm256_movemask_pd(below) & 0x7];
}
// Benchmark driver.
//
// Runs 2^30 calls of the scalar kernel x_1() and then 2^30 calls of the AVX
// kernel x_2(), rotating the three inputs after every call, and prints the
// elapsed wall-clock time in milliseconds together with the last result.
int main()
{
    using bench_clock = std::chrono::high_resolution_clock;
    const int iterations = 1 << 30;

    {
        double a = 0.;
        double b = 1.;
        double c = 2.;
        double r = 0.;
        const bench_clock::time_point t1 = bench_clock::now();
        for (int i = 0; i < iterations; ++i)
        {
            r = x_1(a, b, c, 1.);
            // Rotate (a, b, c) so successive calls see different orderings.
            std::swap(a, b);
            std::swap(b, c);
        }
        const bench_clock::time_point t2 = bench_clock::now();
        std::cout << "NAIVE " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << " : " << r << "\n";
    }
    {
        double r = 0.;
        const bench_clock::time_point t1 = bench_clock::now();
        // Fixed: the loop header was missing the ';' before ++i and did not compile.
        for (int i = 0; i < iterations; ++i)
        {
            r = x_2();
            // Same rotation as above, applied to the first three lanes of v.
            // NOTE(review): these scalar stores are immediately followed by a
            // 256-bit read of v.x in the next x_2() call; that pattern likely
            // defeats store-to-load forwarding and may dominate the measured
            // time — confirm with a profiler.
            std::swap(v.m[0], v.m[1]);
            std::swap(v.m[1], v.m[2]);
        }
        const bench_clock::time_point t2 = bench_clock::now();
        std::cout << "AVX : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << " : " << r;
    }
    return 0;
}
我在 MSVC 上使用完全优化(Full Optimization)和 /arch:AVX 进行编译,但即使只是调用 _mm256_zeroupper,也比 x_1 函数慢……为什么?在什么情况下使用“比较内在函数”(compare intrinsics)才会比原生代码更快?
我的测量结果:x_1() 用时 2.5 秒,x_2() 用时 11.7 秒。