AVX2-方法比传统版本慢14倍

时间:2019-06-29 10:47:43

标签: c++ performance avx2

我已经从http://gruntthepeon.free.fr/ssemath/重写了对数函数,以用于double和AVX2。

但是,整个功能比常规C / C ++版本(1.1s)慢14倍(15s)。当我注释所有使用_mm256_sub_pd的ines时,AVX2的运行速度(300毫秒)比常规C / C ++版本快3倍。怎么了?当我重写sincos并将exp转换为AVX时,没有问题,并且相对于常规C / C ++版本,速度提高了3倍-5倍。

#include <immintrin.h>     //AVX2


#define PD_CONST(Name, Val) \
  static const alignas(32) double _pd_##Name[4] = { Val, Val, Val, Val }

#define PI_CONST_64(Name, Val) \
  static const alignas(32) uint64_t _i64_##Name[4] = { Val, Val, Val, Val }

#define PI_CONST_32(Name,  Val) \
  static const alignas(32) uint32_t _i32_##Name[4] = { Val, Val, Val, Val }

//constants of size 64bit
PD_CONST(1, 1.0f);
PD_CONST(0p5, 0.5f);

PD_CONST(cephes_SQRTHF, 0.707106781186547524);
PD_CONST(cephes_log_p0, 7.0376836292E-2);
PD_CONST(cephes_log_p1, -1.1514610310E-1);
PD_CONST(cephes_log_p2, 1.1676998740E-1);
PD_CONST(cephes_log_p3, -1.2420140846E-1);
PD_CONST(cephes_log_p4, +1.4249322787E-1);
PD_CONST(cephes_log_p5, -1.6668057665E-1);
PD_CONST(cephes_log_p6, +2.0000714765E-1);
PD_CONST(cephes_log_p7, -2.4999993993E-1);
PD_CONST(cephes_log_p8, +3.3333331174E-1);
PD_CONST(cephes_log_q1, -2.12194440e-4);
PD_CONST(cephes_log_q2, 0.693359375);


/* the smallest non denormalized double number */
PI_CONST_64(min_norm_pos ,  0x0080'0000'0000'0000);
PI_CONST_64(mant_mask    ,  0x7ff0'0000'0000'0000);
PI_CONST_64(inv_mant_mask, ~0x7ff0'0000'0000'0000);

PI_CONST_64(inv_sign_mask, 0x7FFF'FFFF'FFFF'FFFF);
PI_CONST_64(sign_mask   , ~0x7FFF'FFFF'FFFF'FFFF);

//constants of size 32bit
PI_CONST_32(1, 0x1);
PI_CONST_32(0x7f, 0x7f);

__m256d _my_mm256_log_pd(__m256d x) {
    __m256d one = *(__m256d*)_pd_1;

    __m256d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LE_OQ);

    x = _mm256_max_pd(x, *(__m256d*)_i64_min_norm_pos);

    //"native" computation in double requires _mm256_cvtepi64_pd
    //which is AVX512   
    //convert double to float -> perform calculation in float -> convert back
    const int MANTISA_SIZE = 23;
    __m128i emm0 = _mm_srli_epi32(_mm_castps_si128(_mm256_cvtpd_ps(x)), MANTISA_SIZE);

    x = _mm256_and_pd(x, *(__m256d*)_i64_inv_mant_mask);
    x = _mm256_or_pd(x, *(__m256d*)_pd_0p5);

    emm0 = _mm_sub_epi32(emm0, *(__m128i*)_i32_0x7f);   
    __m256d e = _mm256_cvtps_pd(_mm_cvtepi32_ps(emm0));
    //=======================================================

    e = _mm256_add_pd(e, one);

    __m256d mask = _mm256_cmp_pd(x, *(__m256d*)_pd_cephes_SQRTHF, _CMP_LT_OQ);
    __m256d tmp = _mm256_and_pd(x, mask);
    x = _mm256_sub_pd(x, one);
    e = _mm256_sub_pd(e, _mm256_and_pd(one, mask));
    x = _mm256_add_pd(x, tmp);

    __m256d y = *(__m256d*)_pd_cephes_log_p0;
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p1);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p2);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p3);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p4);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p5);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p6);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p7);
    y = _mm256_mul_pd(y, x);
    y = _mm256_add_pd(y, *(__m256d*)_pd_cephes_log_p8);
    y = _mm256_mul_pd(y, x);

    __m256d z = _mm256_mul_pd(x, x);
    y = _mm256_mul_pd(y, z);


    tmp = _mm256_mul_pd(e, *(__m256d*)_pd_cephes_log_q1);
    y = _mm256_add_pd(y, tmp);


    tmp = _mm256_mul_pd(z, *(__m256d*)_pd_0p5);
    y = _mm256_sub_pd(y, tmp);

    tmp = _mm256_mul_pd(e, *(__m256d*)_pd_cephes_log_q2);
    x = _mm256_add_pd(x, y);
    x = _mm256_add_pd(x, tmp);

    return _mm256_or_pd(x, invalid_mask); // negative arg will be NAN    
}

我的测试代码:

#include <chrono>
#include <algorithm>
#include <stdio.h>


void findMax(double * data, size_t dataLen){
    double dataMax = std::log(data[0]);

    for (size_t i = 1; i < dataLen; i++) {
        dataMax = std::max(dataMax, std::log(data[i]));
    }

    printf("%f\n", dataMax);
}

void findMaxSimd(double * data, size_t dataLen) {   
    __m256d dataMax = _mm256_load_pd(data);
    dataMax = _my_mm256_log_pd(dataMax);

    size_t dataLen4 = dataLen - dataLen % 4;

    for (size_t i = 4; i < dataLen4; i += 4) {
        __m256d tmp = _mm256_load_pd(data + i);
        __m256d s = _my_mm256_log_pd(tmp);
        dataMax = _mm256_max_pd(dataMax, s);        
    }

    alignas(32)  double res[4];
    _mm256_store_pd(res, dataMax);

    double m = std::max(res[0], std::max(res[1], std::max(res[2], res[3])));

    //process rest of array that is not modulable by 4
    for (size_t i = dataLen4; i < dataLen; i++) {
        m = std::max(m, std::log(data[i]));
    }

    printf("%f\n", m);
}


int main(int argc, char ** argv) {  
    std::uniform_real_distribution<float> uniform_distf(0, 20);
    std::default_random_engine genf;

    size_t dataLen = 4 * 50'000'000 + 2;

    double * alignedData = (double *)_aligned_malloc(dataLen * sizeof(double), 32);

    for (size_t i = 0; i < dataLen; i++) {
        alignedData[i] = uniform_distf(genf);
    }

    auto t00 = std::chrono::high_resolution_clock::now();   
    findMax(alignedData, dataLen);  
    auto tc = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t00).count();
    printf("%lld ms\n", tc);


    t00 = std::chrono::high_resolution_clock::now();    
    findMaxSimd(alignedData, dataLen);
    tc = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - t00).count();
    printf("%lld ms\n", tc);

    _aligned_free(alignedData);

    return 0;
} 

0 个答案:

没有答案