我已经从http://gruntthepeon.free.fr/ssemath/重写了对数函数,以用于double和AVX2。
但是,整个功能比常规C / C ++版本(1.1s)慢14倍(15s)。当我注释所有使用_mm256_sub_pd
的ines时,AVX2的运行速度(300毫秒)比常规C / C ++版本快3倍。怎么了?当我重写sincos并将exp转换为AVX时,没有问题,并且相对于常规C / C ++版本,速度提高了3倍-5倍。
#include <immintrin.h> //AVX2
#define PD_CONST(Name, Val) \
static const alignas(32) double _pd_##Name[4] = { Val, Val, Val, Val }
#define PI_CONST_64(Name, Val) \
static const alignas(32) uint64_t _i64_##Name[4] = { Val, Val, Val, Val }
#define PI_CONST_32(Name, Val) \
static const alignas(32) uint32_t _i32_##Name[4] = { Val, Val, Val, Val }
//constants of size 64bit
PD_CONST(1, 1.0f);
PD_CONST(0p5, 0.5f);
PD_CONST(cephes_SQRTHF, 0.707106781186547524);
PD_CONST(cephes_log_p0, 7.0376836292E-2);
PD_CONST(cephes_log_p1, -1.1514610310E-1);
PD_CONST(cephes_log_p2, 1.1676998740E-1);
PD_CONST(cephes_log_p3, -1.2420140846E-1);
PD_CONST(cephes_log_p4, +1.4249322787E-1);
PD_CONST(cephes_log_p5, -1.6668057665E-1);
PD_CONST(cephes_log_p6, +2.0000714765E-1);
PD_CONST(cephes_log_p7, -2.4999993993E-1);
PD_CONST(cephes_log_p8, +3.3333331174E-1);
PD_CONST(cephes_log_q1, -2.12194440e-4);
PD_CONST(cephes_log_q2, 0.693359375);
/* the smallest non denormalized double number */
PI_CONST_64(min_norm_pos , 0x0080'0000'0000'0000);
PI_CONST_64(mant_mask , 0x7ff0'0000'0000'0000);
PI_CONST_64(inv_mant_mask, ~0x7ff0'0000'0000'0000);
PI_CONST_64(inv_sign_mask, 0x7FFF'FFFF'FFFF'FFFF);
PI_CONST_64(sign_mask , ~0x7FFF'FFFF'FFFF'FFFF);
//constants of size 32bit
PI_CONST_32(1, 0x1);
PI_CONST_32(0x7f, 0x7f);
__m256d _my_mm256_log_pd(__m256d x) {
__m256d one = *(__m256d*)_pd_1;
__m256d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LE_OQ);
x = _mm256_max_pd(x, *(__m256d*)_i64_min_norm_pos);
//"native" computation in double requires _mm256_cvtepi64_pd
//which is AVX512
//convert double to float -> perform calculation in float -> convert back
const int MANTISA_SIZE = 23;
__m128i emm0 = _mm_srli_epi32(_mm_castps_si128(_mm256_cvtpd_ps(x)), MANTISA_SIZE);
x = _mm256_and_pd(x, *(__m256d*)_i64_inv_mant_mask);
x = _mm256_or_pd(x, *(__m256d*)_pd_0p5);
emm0 = _mm_sub_epi32(emm0, *(__m128i*)_i32_0x7f);
__m256d e = _mm256_cvtps_pd(_mm_cvtepi32_ps(emm0));
//=======================================================
e = _mm256_add_pd(e, one);
__m256d mask = _mm256_cmp_pd(x, *(__m256d*)_pd_cephes_SQRTHF, _CMP_LT_OQ);
__m256d tmp = _mm256_and_pd(x, mask);
x = _mm256_sub_pd(x, one);
e = _mm256_sub_pd(e, _mm256_and_pd(one, mask));
x = _mm256_add_pd(x, tmp);
__m256d y = *(__m256d*)_pd_cephes_log_p0;
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p1);
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p2);
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p3);
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p4);
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p5);
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p6);
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p7);
y = _mm256_mul_pd(y, x);
y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p8);
y = _mm256_mul_pd(y, x);
__m256d z = _mm256_mul_pd(x, x);
y = _mm256_mul_pd(y, z);
tmp = _mm256_mul_pd(e, *(__m256d*)_pd_cephes_log_q1);
y = _mm256_add_pd(y, tmp);
tmp = _mm256_mul_pd(z, *(__m256d*)_pd_0p5);
y = _mm256_sub_pd(y, tmp);
tmp = _mm256_mul_pd(e, *(__m256d*)_pd_cephes_log_q2);
x = _mm256_add_pd(x, y);
x = _mm256_add_pd(x, tmp);
return _mm256_or_pd(x, invalid_mask); // negative arg will be NAN
}
我的测试代码:
#include <chrono>
#include <algorithm>
#include <stdio.h>
void findMax(double * data, size_t dataLen){
double dataMax = std::log(data[0]);
for (size_t i = 1; i < dataLen; i++) {
dataMax = std::max(dataMax, std::log(data[i]));
}
printf("%f\n", dataMax);
}
void findMaxSimd(double * data, size_t dataLen) {
__m256d dataMax = _mm256_load_pd(data);
dataMax = _my_mm256_log_pd(dataMax);
size_t dataLen4 = dataLen - dataLen % 4;
for (size_t i = 4; i < dataLen4; i += 4) {
__m256d tmp = _mm256_load_pd(data + i);
__m256d s = _my_mm256_log_pd(tmp);
dataMax = _mm256_max_pd(dataMax, s);
}
alignas(32) double res[4];
_mm256_store_pd(res, dataMax);
double m = std::max(res[0], std::max(res[1], std::max(res[2], res[3])));
//process rest of array that is not modulable by 4
for (size_t i = dataLen4; i < dataLen; i++) {
m = std::max(m, std::log(data[i]));
}
printf("%f\n", m);
}
int main(int argc, char ** argv) {
std::uniform_real_distribution<float> uniform_distf(0, 20);
std::default_random_engine genf;
size_t dataLen = 4 * 50'000'000 + 2;
double * alignedData = (double *)_aligned_malloc(dataLen * sizeof(double), 32);
for (size_t i = 0; i < dataLen; i++) {
alignedData[i] = uniform_distf(genf);
}
auto t00 = std::chrono::high_resolution_clock::now();
findMax(alignedData, dataLen);
auto tc = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t00).count();
printf("%lld ms\n", tc);
t00 = std::chrono::high_resolution_clock::now();
findMaxSimd(alignedData, dataLen);
tc = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t00).count();
printf("%lld ms\n", tc);
_aligned_free(alignedData);
return 0;
}