为什么AVX加速以下代码不符合预期?

时间:2017-03-27 10:00:06

标签: c++ parallel-processing sse simd avx

我使用了基本的 `_mm256_mul_ps` 和 `_mm256_add_ps` 指令,并将结果与不使用AVX的普通实现进行了比较。由于我在AVX中使用的是 float(单精度浮点数),一次可以同时加载并处理8个元素,从逻辑上讲,相比普通实现应该有8倍的加速,但为什么我得到的加速比却不超过3/4?请查看我的代码并提出建议。谢谢。

/**
 * Returns the current time of day, as reported by gettimeofday(), expressed
 * in seconds with the microsecond part as the fraction.
 */
inline double timestamp() {
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_seconds = double(now.tv_sec);
    const double fractional_seconds = now.tv_usec / 1000000.;
    return whole_seconds + fractional_seconds;
}

void AVXsum(float *a, float *b, float *c, int ARR_SIZE){

        printf("AVX Addition:\n\n");

        for (int i=0; i < ARR_SIZE ; i+=8){

         __m256 vecA __attribute__(( aligned(32))) = _mm256_load_ps(&a[i]); // loading 8 values starting from the address of "i"th value of array a

         __m256 vecB __attribute__(( aligned(32))) = _mm256_load_ps(&b[i]); // loading 8 values starting from the address of "i"th value of array b

         __m256 res __attribute__(( aligned(32))) = _mm256_add_ps(vecA,vecB); // adding 8 values of array a and b

         _mm256_store_ps(&c[i],res); // storing the value in the "i"th address of another array c

         printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);

        }

        printf("\n");

}

void AVXmul(float *a, float *b, float *c, int ARR_SIZE){

        printf("AVX Multiplication:\n\n");

        for (int i=0; i < ARR_SIZE ; i+=8){

         __m256 vecA __attribute__(( aligned(32))) = _mm256_load_ps(&a[i]);

         __m256 vecB __attribute__(( aligned(32))) = _mm256_load_ps(&b[i]);

         __m256 res __attribute__(( aligned(32))) = _mm256_mul_ps(vecA,vecB);

         _mm256_store_ps(&c[i],res);

         printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);

        }

        printf("\n");

}

/**
 * Scalar reference implementation: stores a[i] + b[i] into c[i] for every
 * index below ARR_SIZE and prints each stored value followed by a tab.
 *
 * @param a         first input array
 * @param b         second input array
 * @param c         output array receiving the sums
 * @param ARR_SIZE  number of elements to process
 */
void Normalsum(float *a, float *b, float *c, int ARR_SIZE){

        printf("Normal Addition:\n\n");

        int idx = 0;

        while (idx < ARR_SIZE){

         const float sum = a[idx] + b[idx];

         c[idx] = sum;
         printf("%f\t", c[idx]);

         ++idx;

        }

        printf("\n\n");

}

/**
 * Scalar reference implementation: stores a[i] * b[i] into c[i] for every
 * index below ARR_SIZE and prints each stored value followed by a tab.
 *
 * @param a         first input array
 * @param b         second input array
 * @param c         output array receiving the products
 * @param ARR_SIZE  number of elements to process
 */
void Normalmul(float *a, float *b, float *c, int ARR_SIZE){

        printf("Normal Multiplication:\n\n");

        int idx = 0;

        while (idx < ARR_SIZE){

         const float product = a[idx] * b[idx];

         c[idx] = product;
         printf("%f\t", c[idx]);

         ++idx;

        }

        printf("\n");

}

/**
 * Reads an array size, fills two arrays with random values in 1..100, runs
 * the scalar and AVX add/multiply routines once each, and prints the elapsed
 * time of every run plus the scalar/AVX time ratios.
 *
 * Returns 1 if the size cannot be read or is not positive, 0 otherwise.
 */
int main(){

    double start, normalsumTime, normalmulTime, avxsumTime, avxmulTime;

    int size = 0;
    printf("Insert the size of array: ");

    // The size becomes the length of three variable-length arrays below, so
    // an unreadable or non-positive value must be rejected before use.
    if (scanf("%d", &size) != 1 || size <= 0){
        fprintf(stderr, "Invalid array size\n");
        return 1;
    }

    // initialization of array and generating random value as per entered size stated above
    // NOTE(review): these are variable-length arrays with automatic storage;
    // a very large size may not fit - confirm the intended input range.
    float a[size] __attribute__(( aligned(32)));
    for(int i=0; i<size; i++){
        a[i] = (rand()%100)+1;
    }

    float b[size] __attribute__(( aligned(32)));
    for(int i=0; i<size; i++){
        b[i] = (rand()%100)+1;
    }

    const int arrsize = size;
    float c[arrsize] __attribute__(( aligned(32)));


    // Each routine is called once and timed as a whole; the measured
    // interval therefore includes the printf calls made inside the routine.
    start = timestamp();
    Normalsum(a, b, c, arrsize);
    normalsumTime = timestamp() - start;

    start = timestamp();
    Normalmul(a, b, c, arrsize);
    normalmulTime = timestamp() - start;

    start = timestamp();
    AVXsum(a, b, c, arrsize);
    avxsumTime = timestamp() - start;

    start = timestamp();
    AVXmul(a, b, c, arrsize);
    avxmulTime = timestamp() - start;

    //printing the output
    cout << "Normal Sum took " << normalsumTime << " s" << endl;
    cout << "Normal Mul took " << normalmulTime << " s" << endl;
    cout << "AVX Sum took " << avxsumTime << " s" << endl;
    cout << "AVX Mul took " << avxmulTime << " s" << endl;
    cout << "Sum SpeedUP AVX2= " << normalsumTime / avxsumTime << endl;
    cout << "Mul SpeedUP AVX2= " << normalmulTime / avxmulTime << endl;
    cout << "===========================" << endl;

   return 0;

}

1 个答案:

答案 0 :(得分:1)

似乎

  1. 您没有多次重复运行函数并取其中的最短时间。这一点至关重要!
  2. 您的函数不应包含printf等函数
  3. 每次迭代都要执行 `_mm256_load_ps` 和 `_mm256_store_ps` 指令,也就是说您的程序并非只使用AVX计算指令。换句话说,内存访问指令会拖累性能,因此无法获得8倍的加速。
  4. 我修改了您的实现,以获得更准确的结果

    #include <x86intrin.h>
    #include <stdio.h>
    #include <time.h>
    
    
    inline void AVXsum(float *a, float *b, float *c, int ARR_SIZE)
    {
        for (int i=0; i < ARR_SIZE ; i+=8){
            //__m256 vecA __attribute__(( aligned(32))) = _mm256_load_ps(&a[i]); // loading 8 values starting from the address of "i"th value of array a
            //__m256 vecB __attribute__(( aligned(32))) = _mm256_load_ps(&b[i]); // loading 8 values starting from the address of "i"th value of array b
    
            __m256 res __attribute__(( aligned(32))) = _mm256_add_ps(_mm256_load_ps(&a[i]),_mm256_load_ps(&b[i])); // adding 8 values of array a and b
    
            _mm256_store_ps(&c[i],res); // storing the value in the "i"th address of another array c
    
            //printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);
        }
    }
    
    inline void AVXmul(float *a, float *b, float *c, int ARR_SIZE)
    {
        for (int i=0; i < ARR_SIZE ; i+=8){
            //__m256 vecA __attribute__(( aligned(32))) = _mm256_load_ps(&a[i]);
            //__m256 vecB __attribute__(( aligned(32))) = _mm256_load_ps(&b[i]);
    
            __m256 res __attribute__(( aligned(32))) = _mm256_mul_ps(_mm256_load_ps(&a[i]),_mm256_load_ps(&b[i]));
    
            _mm256_store_ps(&c[i],res);
    
            //printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);
        }
        //printf("\n");
    }
    
    /**
     * Scalar reference implementation: c[i] = a[i] + b[i] for 0 <= i < ARR_SIZE.
     *
     * Declared static inline: a plain C99/C11 `inline` definition provides no
     * external definition, so a call the compiler chooses not to inline would
     * fail to link.
     *
     * @param a         first input array
     * @param b         second input array
     * @param c         output array receiving the sums
     * @param ARR_SIZE  number of elements; values <= 0 do nothing
     */
    static inline void Normalsum(const float *a, const float *b, float *c, int ARR_SIZE)
    {
        for (int i = 0; i < ARR_SIZE; i++) {
            c[i] = a[i] + b[i];
        }
    }
    
    /**
     * Scalar reference implementation: c[i] = a[i] * b[i] for 0 <= i < ARR_SIZE.
     *
     * Declared static inline: a plain C99/C11 `inline` definition provides no
     * external definition, so a call the compiler chooses not to inline would
     * fail to link.
     *
     * @param a         first input array
     * @param b         second input array
     * @param c         output array receiving the products
     * @param ARR_SIZE  number of elements; values <= 0 do nothing
     */
    static inline void Normalmul(const float *a, const float *b, float *c, int ARR_SIZE)
    {
        for (int i = 0; i < ARR_SIZE; i++) {
            c[i] = a[i] * b[i];
        }
    }
    /*
     * Benchmark parameters, as enum constants rather than object-like macros:
     * a macro named `size` would textually replace every later identifier
     * spelled `size`, whereas an enum constant obeys normal scoping rules.
     */
    enum {
        size     = 10000,    /* number of floats in each array */
        arrsize  = size,     /* element count passed to the kernels */
        NUM_LOOP = 1000000   /* timed repetitions per kernel */
    };
    
    /* Returns the interval from *start to *end in seconds. */
    static double elapsed_seconds(const struct timespec *start, const struct timespec *end)
    {
        return (double)(end->tv_sec - start->tv_sec)
             + (double)(end->tv_nsec - start->tv_nsec) / 1000000000.0;
    }

    /**
     * Benchmark driver: fills two arrays with random values in 1..100, runs
     * each of the four kernels NUM_LOOP times, keeps the shortest observed
     * time per kernel, and prints the times and the scalar/AVX ratios.
     */
    int main(){

        double normalsumTime = 0.0, normalmulTime = 0.0, avxsumTime = 0.0, avxmulTime = 0.0;
        struct timespec tStart, tEnd;
        double tTotal;

        printf("the size of array is: %d \n", size);

        // initialization of array and generating random value as per the size stated above
        float a[size] __attribute__(( aligned(32)));
        for(int i=0; i<size; i++){
            a[i] = (rand()%100)+1;
        }

        float b[size] __attribute__(( aligned(32)));
        for(int i=0; i<size; i++){
            b[i] = (rand()%100)+1;
        }

        float c[arrsize] __attribute__(( aligned(32)));

        /*
         * Each loop below records the best (smallest) time over NUM_LOOP runs.
         * The first run seeds the minimum, so no sentinel value is needed.
         * The empty asm statement takes c as an input and clobbers memory: it
         * makes the kernel's stores observable, so an optimizer that inlines
         * the kernel cannot discard them as dead writes to a local array.
         */
        printf("\nNormal Addition ... :\n\n");
        for (int w = 0; w < NUM_LOOP; w++) {
            clock_gettime(CLOCK_MONOTONIC, &tStart);
            Normalsum(a, b, c, arrsize);
            __asm__ volatile("" : : "r"(c) : "memory");
            clock_gettime(CLOCK_MONOTONIC, &tEnd);

            tTotal = elapsed_seconds(&tStart, &tEnd);
            if (w == 0 || tTotal < normalsumTime)
                normalsumTime = tTotal;
        }

        printf("Normal Multiplication .... \n\n");
        for (int w = 0; w < NUM_LOOP; w++) {
            clock_gettime(CLOCK_MONOTONIC, &tStart);
            Normalmul(a, b, c, arrsize);
            __asm__ volatile("" : : "r"(c) : "memory");
            clock_gettime(CLOCK_MONOTONIC, &tEnd);

            tTotal = elapsed_seconds(&tStart, &tEnd);
            if (w == 0 || tTotal < normalmulTime)
                normalmulTime = tTotal;
        }

        printf("AVX Addition....\n\n");
        for (int w = 0; w < NUM_LOOP; w++) {
            clock_gettime(CLOCK_MONOTONIC, &tStart);
            AVXsum(a, b, c, arrsize);
            __asm__ volatile("" : : "r"(c) : "memory");
            clock_gettime(CLOCK_MONOTONIC, &tEnd);

            tTotal = elapsed_seconds(&tStart, &tEnd);
            if (w == 0 || tTotal < avxsumTime)
                avxsumTime = tTotal;
        }

        printf("AVX Multiplication ....\n\n");
        for (int w = 0; w < NUM_LOOP; w++) {
            clock_gettime(CLOCK_MONOTONIC, &tStart);
            AVXmul(a, b, c, arrsize);
            __asm__ volatile("" : : "r"(c) : "memory");
            clock_gettime(CLOCK_MONOTONIC, &tEnd);

            tTotal = elapsed_seconds(&tStart, &tEnd);
            if (w == 0 || tTotal < avxmulTime)
                avxmulTime = tTotal;
        }

        //printing the output
        printf("Normal Sum took %lf s\n" , normalsumTime);
        printf("Normal Mul took %lf s\n",  normalmulTime);
        printf("AVX Sum took %lf s \n", avxsumTime);
        printf( "AVX Mul took %lf s\n", avxmulTime);
        printf("Sum SpeedUP AVX= %lf ", normalsumTime / avxsumTime );
        printf("Mul SpeedUP AVX= %lf \n", normalmulTime / avxmulTime );
        printf( "===========================\n");

        return 0;

    }
    

    输出结果为:

    //gcc -O2 
    //skylake
    Normal Sum took 0.000005 s
    Normal Mul took 0.000005 s
    AVX Sum took 0.000001 s 
    AVX Mul took 0.000001 s
    Sum SpeedUP AVX= 4.418283 Mul SpeedUP AVX= 4.491080