我使用了基本的 `_mm256_mul_ps` 和 `_mm256_add_ps` 内联函数,并将其与不使用 AVX 的普通(标量)实现进行比较。由于我在 AVX 中使用的是 float 类型,一次可以同时加载并处理 8 个元素,按理说相对于普通实现应该有 8 倍的加速,但为什么我实际得到的加速比不超过 3 到 4 倍?请查看我的代码并提出建议。谢谢。
// Returns the current time reported by gettimeofday() as fractional seconds:
// the whole-second field plus the microsecond field scaled by 1e-6.
inline double timestamp() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole_seconds = static_cast<double>(now.tv_sec);
    const double sub_second = now.tv_usec / 1000000.;
    return whole_seconds + sub_second;
}
/*
 * Element-wise c[i] = a[i] + b[i] for ARR_SIZE floats using 8-wide AVX
 * vectors, printing every result as it is produced.
 *
 * a, b, c  : arrays holding at least ARR_SIZE floats. They must be 32-byte
 *            aligned because the aligned load/store intrinsics are used.
 * ARR_SIZE : number of elements to process; a value <= 0 prints only the
 *            header and the trailing blank line.
 *
 * Complete groups of 8 floats go through the vector path and are printed one
 * group per line. A trailing group of fewer than 8 elements is processed one
 * float at a time and printed on a final, shorter line, so nothing beyond
 * index ARR_SIZE - 1 is ever read, written or printed.
 */
void AVXsum(float *a, float *b, float *c, int ARR_SIZE){
    enum { LANES = 8 };  /* number of floats in one __m256 */
    /* Largest multiple of LANES that does not exceed ARR_SIZE. */
    const int vec_end = ARR_SIZE > 0 ? ARR_SIZE - ARR_SIZE % LANES : 0;
    int i = 0;

    printf("AVX Addition:\n\n");
    for (; i < vec_end; i += LANES) {
        const __m256 vecA = _mm256_load_ps(&a[i]);  /* a[i] .. a[i + 7] */
        const __m256 vecB = _mm256_load_ps(&b[i]);  /* b[i] .. b[i + 7] */
        _mm256_store_ps(&c[i], _mm256_add_ps(vecA, vecB));
        printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);
    }
    /* Scalar tail for the last ARR_SIZE % 8 elements. */
    for (; i < ARR_SIZE; i++) {
        c[i] = a[i] + b[i];
        printf("%f%c", c[i], i + 1 < ARR_SIZE ? ' ' : '\n');
    }
    printf("\n");
}
/*
 * Element-wise c[i] = a[i] * b[i] for ARR_SIZE floats using 8-wide AVX
 * vectors, printing every result as it is produced.
 *
 * a, b, c  : arrays holding at least ARR_SIZE floats. They must be 32-byte
 *            aligned because the aligned load/store intrinsics are used.
 * ARR_SIZE : number of elements to process; a value <= 0 prints only the
 *            header and the trailing blank line.
 *
 * Complete groups of 8 floats go through the vector path and are printed one
 * group per line. A trailing group of fewer than 8 elements is processed one
 * float at a time and printed on a final, shorter line, so nothing beyond
 * index ARR_SIZE - 1 is ever read, written or printed.
 */
void AVXmul(float *a, float *b, float *c, int ARR_SIZE){
    enum { LANES = 8 };  /* number of floats in one __m256 */
    /* Largest multiple of LANES that does not exceed ARR_SIZE. */
    const int vec_end = ARR_SIZE > 0 ? ARR_SIZE - ARR_SIZE % LANES : 0;
    int i = 0;

    printf("AVX Multiplication:\n\n");
    for (; i < vec_end; i += LANES) {
        const __m256 vecA = _mm256_load_ps(&a[i]);  /* a[i] .. a[i + 7] */
        const __m256 vecB = _mm256_load_ps(&b[i]);  /* b[i] .. b[i + 7] */
        _mm256_store_ps(&c[i], _mm256_mul_ps(vecA, vecB));
        printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);
    }
    /* Scalar tail for the last ARR_SIZE % 8 elements. */
    for (; i < ARR_SIZE; i++) {
        c[i] = a[i] * b[i];
        printf("%f%c", c[i], i + 1 < ARR_SIZE ? ' ' : '\n');
    }
    printf("\n");
}
/*
 * Scalar reference addition: stores a[k] + b[k] into c[k] for every k in
 * [0, ARR_SIZE) and prints each stored value followed by a tab. A header line
 * is printed first and two newlines are printed at the end.
 */
void Normalsum(float *a, float *b, float *c, int ARR_SIZE){
    printf("Normal Addition:\n\n");

    int idx = 0;
    while (idx < ARR_SIZE) {
        c[idx] = a[idx] + b[idx];
        printf("%f\t", c[idx]);
        ++idx;
    }

    printf("\n\n");
}
/*
 * Scalar reference multiplication: stores a[k] * b[k] into c[k] for every k in
 * [0, ARR_SIZE) and prints each stored value followed by a tab. A header line
 * is printed first and a single newline is printed at the end.
 */
void Normalmul(float *a, float *b, float *c, int ARR_SIZE){
    printf("Normal Multiplication:\n\n");

    for (int k = 0; k != ARR_SIZE && k < ARR_SIZE; ++k) {
        const float product = a[k] * b[k];
        c[k] = product;
        printf("%f\t", product);
    }

    printf("\n");
}
int main(){
double time, normalsumTime, normalmulTime, avxsumTime, avxmulTime;
int size;
printf("Insert the size of array: ");
scanf("%d", &size);
// initialization of array and generating random value as per entered size stated above
float a[size] __attribute__(( aligned(32)));
for(int i=0; i<size; i++){
a[i] = (rand()%100)+1;
}
float b[size] __attribute__(( aligned(32)));
for(int i=0; i<size; i++){
b[i] = (rand()%100)+1;
}
int arrsize = sizeof(a) / sizeof (a[0]);
float c[arrsize] __attribute__(( aligned(32)));
//the function is called and time is calculated
time = timestamp();
Normalsum((float*)&a, (float*)&b , (float*)&c, arrsize);
normalsumTime = timestamp() - time;
time = timestamp();
Normalmul((float*)&a, (float*)&b , (float*)&c, arrsize);
normalmulTime = timestamp() - time;
time = timestamp();
AVXsum((float*)&a, (float*)&b , (float*)&c, arrsize);
avxsumTime = timestamp() - time;
time = timestamp();
AVXmul((float*)&a, (float*)&b , (float*)&c, arrsize);
avxmulTime = timestamp() - time;
//printing the output
cout << "Normal Sum took " << normalsumTime << " s" << endl;
cout << "Normal Mul took " << normalmulTime << " s" << endl;
cout << "AVX Sum took " << avxsumTime << " s" << endl;
cout << "AVX Mul took " << avxmulTime << " s" << endl;
cout << "Sum SpeedUP AVX2= " << normalsumTime / avxsumTime << endl;
cout << "Mul SpeedUP AVX2= " << normalmulTime / avxmulTime << endl;
cout << "===========================" << endl;
return 0;
}
答案 0 :(得分:1)
看起来是因为使用了 `printf` 之类的函数以及 `_mm256_store_ps` 等指令,您的程序执行的并不只是 AVX 计算指令。换句话说,内存访问指令会拖累性能,因此无法达到 8 倍的加速。我修改了您的实现,以获得更准确的测量结果:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <x86intrin.h>
/*
 * Element-wise c[i] = a[i] + b[i] for ARR_SIZE floats using 8-wide AVX vectors.
 *
 * a, b, c  : arrays holding at least ARR_SIZE floats. They must be 32-byte
 *            aligned because the aligned load/store intrinsics are used.
 * ARR_SIZE : number of elements to process; a value <= 0 does nothing.
 *
 * Complete groups of 8 floats use the vector path; a trailing group of fewer
 * than 8 elements is handled one float at a time so nothing beyond index
 * ARR_SIZE - 1 is read or written.
 *
 * static inline (rather than plain inline) gives the function internal linkage,
 * so the program still links when the compiler chooses not to inline a call.
 */
static inline void AVXsum(float *a, float *b, float *c, int ARR_SIZE)
{
    enum { LANES = 8 };  /* number of floats in one __m256 */
    /* Largest multiple of LANES that does not exceed ARR_SIZE. */
    const int vec_end = ARR_SIZE > 0 ? ARR_SIZE - ARR_SIZE % LANES : 0;
    int i = 0;

    for (; i < vec_end; i += LANES) {
        const __m256 sum = _mm256_add_ps(_mm256_load_ps(&a[i]), _mm256_load_ps(&b[i]));
        _mm256_store_ps(&c[i], sum);
    }
    /* Scalar tail for the last ARR_SIZE % 8 elements. */
    for (; i < ARR_SIZE; i++) {
        c[i] = a[i] + b[i];
    }
}
/*
 * Element-wise c[i] = a[i] * b[i] for ARR_SIZE floats using 8-wide AVX vectors.
 *
 * a, b, c  : arrays holding at least ARR_SIZE floats. They must be 32-byte
 *            aligned because the aligned load/store intrinsics are used.
 * ARR_SIZE : number of elements to process; a value <= 0 does nothing.
 *
 * Complete groups of 8 floats use the vector path; a trailing group of fewer
 * than 8 elements is handled one float at a time so nothing beyond index
 * ARR_SIZE - 1 is read or written.
 *
 * static inline (rather than plain inline) gives the function internal linkage,
 * so the program still links when the compiler chooses not to inline a call.
 */
static inline void AVXmul(float *a, float *b, float *c, int ARR_SIZE)
{
    enum { LANES = 8 };  /* number of floats in one __m256 */
    /* Largest multiple of LANES that does not exceed ARR_SIZE. */
    const int vec_end = ARR_SIZE > 0 ? ARR_SIZE - ARR_SIZE % LANES : 0;
    int i = 0;

    for (; i < vec_end; i += LANES) {
        const __m256 product = _mm256_mul_ps(_mm256_load_ps(&a[i]), _mm256_load_ps(&b[i]));
        _mm256_store_ps(&c[i], product);
    }
    /* Scalar tail for the last ARR_SIZE % 8 elements. */
    for (; i < ARR_SIZE; i++) {
        c[i] = a[i] * b[i];
    }
}
/*
 * Scalar reference addition: c[i] = a[i] + b[i] for i in [0, ARR_SIZE).
 *
 * a, b, c  : arrays holding at least ARR_SIZE floats.
 * ARR_SIZE : number of elements to process; a value <= 0 does nothing.
 *
 * static inline (rather than plain inline) gives the function internal linkage,
 * so the program still links when the compiler chooses not to inline a call.
 */
static inline void Normalsum(float *a, float *b, float *c, int ARR_SIZE)
{
    for (int i = 0; i < ARR_SIZE; i++) {
        c[i] = a[i] + b[i];
    }
}
/*
 * Scalar reference multiplication: c[i] = a[i] * b[i] for i in [0, ARR_SIZE).
 *
 * a, b, c  : arrays holding at least ARR_SIZE floats.
 * ARR_SIZE : number of elements to process; a value <= 0 does nothing.
 *
 * static inline (rather than plain inline) gives the function internal linkage,
 * so the program still links when the compiler chooses not to inline a call.
 */
static inline void Normalmul(float *a, float *b, float *c, int ARR_SIZE)
{
    for (int i = 0; i < ARR_SIZE; i++) {
        c[i] = a[i] * b[i];
    }
}
/* Number of float elements in each of the benchmark arrays a and b in main. */
#define size 10000
/* Length of the result array c and the element count passed to every kernel; same value as size. */
#define arrsize size
/* Bound on the repeat counter of each best-time measurement loop in main. */
#define NUM_LOOP 1000000
/* Seconds elapsed from *start to *end (two readings of the same clock). */
static double elapsed_seconds(const struct timespec *start, const struct timespec *end)
{
    return (double)(end->tv_sec - start->tv_sec)
         + (double)(end->tv_nsec - start->tv_nsec) / 1000000000.0;
}

/*
 * Evaluates `call` NUM_LOOP times and stores the shortest single-run time, in
 * seconds on CLOCK_MONOTONIC, into `best`.
 *
 * `out` is the buffer written by `call`. The empty asm statement tells the
 * compiler that memory reachable through `out` may be observed, so the stores
 * made by `call` cannot be discarded even though the program never reads them.
 *
 * This is a macro rather than a helper taking a function pointer so that each
 * kernel stays a direct call at the measurement site.
 */
#define MEASURE_BEST(best, out, call)                                  \
    do {                                                               \
        (best) = -1.0;                                                 \
        for (int rep_ = 0; rep_ < NUM_LOOP; rep_++) {                  \
            struct timespec begin_, end_;                              \
            clock_gettime(CLOCK_MONOTONIC, &begin_);                   \
            call;                                                      \
            __asm__ volatile("" : : "r"(out) : "memory");              \
            clock_gettime(CLOCK_MONOTONIC, &end_);                     \
            const double run_ = elapsed_seconds(&begin_, &end_);       \
            if ((best) < 0.0 || run_ < (best)) {                       \
                (best) = run_;                                         \
            }                                                          \
        }                                                              \
    } while (0)

/*
 * Benchmark driver: fills two float arrays with random integers in [1, 100],
 * measures the best-of-NUM_LOOP run time of the scalar and AVX add/multiply
 * kernels, and prints the times and the scalar/AVX ratios. Always returns 0.
 */
int main(void){
    double normalsumTime, normalmulTime, avxsumTime, avxmulTime;

    printf("the size of array is: %d \n", size);

    /* 32-byte alignment is required by the aligned AVX loads and stores. */
    float a[size] __attribute__(( aligned(32)));
    float b[size] __attribute__(( aligned(32)));
    float c[arrsize] __attribute__(( aligned(32)));

    /* Random inputs in [1, 100]; a is filled completely before b. */
    for (int i = 0; i < size; i++) {
        a[i] = (float)((rand() % 100) + 1);
    }
    for (int i = 0; i < size; i++) {
        b[i] = (float)((rand() % 100) + 1);
    }

    printf("\nNormal Addition ... :\n\n");
    MEASURE_BEST(normalsumTime, c, Normalsum(a, b, c, arrsize));

    printf("Normal Multiplication .... \n\n");
    MEASURE_BEST(normalmulTime, c, Normalmul(a, b, c, arrsize));

    printf("AVX Addition....\n\n");
    MEASURE_BEST(avxsumTime, c, AVXsum(a, b, c, arrsize));

    printf("AVX Multiplication ....\n\n");
    MEASURE_BEST(avxmulTime, c, AVXmul(a, b, c, arrsize));

    /* Report */
    printf("Normal Sum took %lf s\n" , normalsumTime);
    printf("Normal Mul took %lf s\n", normalmulTime);
    printf("AVX Sum took %lf s \n", avxsumTime);
    printf( "AVX Mul took %lf s\n", avxmulTime);
    printf("Sum SpeedUP AVX= %lf ", normalsumTime / avxsumTime );
    printf("Mul SpeedUP AVX= %lf \n", normalmulTime / avxmulTime );
    printf( "===========================\n");
    return 0;
}
输出结果为:
//gcc -O2
//skylake
Normal Sum took 0.000005 s
Normal Mul took 0.000005 s
AVX Sum took 0.000001 s
AVX Mul took 0.000001 s
Sum SpeedUP AVX= 4.418283 Mul SpeedUP AVX= 4.491080