使用AVX2指令的循环的可变执行时间

时间:2019-06-13 16:40:41

标签: c cpu

我想使用AVX2指令将浮点缓冲区转换为int8缓冲区。

当我测量转换函数的时间时,我可以看到它非常稳定,但有时一种测量方法确实要比另一种测量方法高。

这是我的代码

#include <immintrin.h>
#include <stdio.h>
#include <time.h>

static __m64 func (const __m256* coef_8xFP32_,
            float** input_1xFP32_ptr_)
{
    __m256   input_8xFP32 = {};
    __m256   coeffed_8xFP32 = {};
    __m256i  convert_8xINT32;
    __m128i* convertLow_4xINT8_ptr = nullptr;
    __m128i* convertHigh_4xINT8_ptr = nullptr;
    __m128i  packed_8xINT16 = {};
    __m64*   packedLow_8xINT8_ptr = nullptr;
    __m64*   packedHigh_8xINT8_ptr = nullptr;
    __m64    packed_8xINT8 = {};

    // init 8xFP32 with input data
    input_8xFP32 = _mm256_set_ps((*input_1xFP32_ptr_)[7], (*input_1xFP32_ptr_)[6], (*input_1xFP32_ptr_)[5], (*input_1xFP32_ptr_)[4],
                                 (*input_1xFP32_ptr_)[3], (*input_1xFP32_ptr_)[2], (*input_1xFP32_ptr_)[1], (*input_1xFP32_ptr_)[0]);

    (*input_1xFP32_ptr_) += 8;

    // multiple 8xFP32 per coef
    coeffed_8xFP32 = _mm256_mul_ps(input_8xFP32, *coef_8xFP32_);

    // convert 8xFP32 into 8xINT32
    convert_8xINT32 = _mm256_cvtps_epi32 (coeffed_8xFP32);

    // pack 8xINT32 into 8xINT16
    convertLow_4xINT8_ptr  = (__m128i*)&convert_8xINT32;
    convertHigh_4xINT8_ptr = (__m128i*)&convert_8xINT32 + 1;
    packed_8xINT16 = _mm_packs_epi32 (*convertLow_4xINT8_ptr, *convertHigh_4xINT8_ptr);

    // pack 8xINT16 into 8xINT8
    packedLow_8xINT8_ptr  = (__m64*)&packed_8xINT16;
    packedHigh_8xINT8_ptr = (__m64*)&packed_8xINT16 + 1;
    packed_8xINT8 = _mm_packs_pi16 (*packedLow_8xINT8_ptr, *packedHigh_8xINT8_ptr);

    return (packed_8xINT8);
}

int main(int argc, char** argv)
{
    const int bufferSize = 8 * 10000;
    float input[bufferSize] = {};
    int8_t output[bufferSize] = {};
    float coef = 1.;

    const int statSize = 1000;
    struct timespec start;
    struct timespec end;
    double val[statSize] = {};
    double min = 1;
    double max = 0;
    double accu = 0;
    double mean = 0;

    for (int i = 0; i < statSize; i++)
    {
        srand(time(NULL));
        for (int j = 0; j < bufferSize; j += 1)
            input[j] = (rand() % 2560) / 10. - 128;

        __m256 coef_8xFP32 = _mm256_set1_ps(coef);
        float* input_1xFP32_ptr = (float*)input;
        __m64* output_ptr_8xINT8 = (__m64*)output;

        clock_gettime(CLOCK_REALTIME, &start);

        for (int j = 0; j < bufferSize; j += 8)
        {
            *output_ptr_8xINT8 = func (&coef_8xFP32, &input_1xFP32_ptr);
            ++output_ptr_8xINT8;
        }

        clock_gettime(CLOCK_REALTIME, &end);

        val[i] = (end.tv_sec - start.tv_sec) / 1e-3 + (end.tv_nsec - start.tv_nsec) / 1e6;
        if (val[i] > max)
            max = val[i];
        if (val[i] < min)
            min = val[i];
        accu += val[i];
    }

//  for (int j = 0; j < bufferSize; j += 1)
//      printf ("%d %.3f %d\n", j, input[j], output[j]);

    mean = accu / statSize;
    for (int i = 0; i < statSize; i++)
        if (val[i] > ((max - mean) / 2 + mean))
            printf ("%3d %f\n", i, val[i]);

    printf ("min   %f\n", min);
    printf ("max   %f\n", max);
    printf ("mean  %f\n", mean);
    printf ("DELTA %f\n", max / mean);

    return (0);
}

还有编译命令

g++ -Wall -Werror -O3 -std=c++11 -mavx2 -c floatToInt8.cpp -o floatToInt8.o 
g++ floatToInt8.o -o floatToInt8 

我期望最小值,最大值和平均值非常相似。 这是我的结果(我在几台Linux主机上进行了尝试,结果也相似):

  0 0.060360
 34 0.068733
 92 0.067925
369 0.058768
565 0.059178
604 0.055466
621 0.065434
635 0.068510
min   0.036119
max   0.068733
mean  0.040500
DELTA 1.697113

如何解释这几个太高的值?

0 个答案:

没有答案