最近我一直在尝试用SSE指令来加速简单的计算(加法和乘法)。有人告诉我,使用SSEx可以获得2倍的加速,但我的测试结果只有1.25倍的提升。我的代码有什么问题吗?
我尝试过将输入数组声明为全局变量以保持地址连续,也尝试过在SSE部分不使用局部变量,但都没有效果。
以下是代码,编译命令为:g++ -mfpmath=sse -mmmx -msse -msse2 -msse4.1 -O -Wall test.c
#define N 32768
#include<stdio.h>
#include<stdlib.h>
#include<stdint.h>
#include <smmintrin.h> //sse4.1
#include <emmintrin.h> //sse2
#include <xmmintrin.h> //sse
#include <mmintrin.h> //mmx
#include <time.h>
#include <string.h>
/* Fills the N-element float array with rand()-derived values in [0, 1]. */
void init_with_rand(float *array);

/*
 * Benchmark buffers.
 *
 * The SSE loop in main() accesses these with _mm_load_ps/_mm_store_ps, which
 * require 16-byte-aligned addresses.  A plain float array is only guaranteed
 * float alignment by the language, so the 16-byte alignment is requested
 * explicitly instead of relying on what the target ABI happens to provide.
 * output1 is only written by the scalar loop; it is aligned as well so both
 * result buffers are laid out the same way.
 */
float input1[N] __attribute__((aligned(16)));
float input2[N] __attribute__((aligned(16)));
float input3[N] __attribute__((aligned(16)));
float output1[N] __attribute__((aligned(16))); /* scalar result */
float output2[N] __attribute__((aligned(16))); /* SSE result */

/* Four-float scratch vectors used by the SSE loop in main(). */
__m128 A, B, C, MUX, SUM;
/*
 * Benchmark driver.
 *
 * Computes output[i] = input1[i] * input2[i] + input3[i] over N elements,
 * repeated 1000000 times, first with a plain scalar loop (into output1) and
 * then with SSE intrinsics handling four floats per iteration (into
 * output2).  Prints the clock() ticks and seconds for each variant, the
 * ratio of the two timings, and whether both result arrays are identical.
 *
 * Returns 0.
 */
int main(void)
{
    clock_t t1, t2;
    int i, j;

    init_with_rand(input1);
    init_with_rand(input2);
    init_with_rand(input3);

    /* Scalar reference implementation. */
    t1 = clock();
    for (j = 0; j < 1000000; j++) {
        for (i = 0; i < N; i++) {
            output1[i] = input1[i] * input2[i] + input3[i];
        }
    }
    t1 = clock() - t1;
    /* clock_t is not int: cast to long so the argument matches %ld. */
    printf("It took me %ld clicks (%f seconds).\n", (long)t1, ((float)t1) / CLOCKS_PER_SEC);

    /*
     * SSE implementation.
     * _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses; i advances
     * by 4 floats (16 bytes), so every access is aligned exactly when the
     * array base addresses are.  The loop also assumes N is a multiple of 4.
     * A, B, C, MUX and SUM are file-scope variables, not locals.
     */
    t2 = clock();
    for (j = 0; j < 1000000; j++) {
        for (i = 0; i < N; i += 4) {
            A = _mm_load_ps(input1 + i);
            B = _mm_load_ps(input2 + i);
            C = _mm_load_ps(input3 + i);
            MUX = _mm_mul_ps(A, B);
            SUM = _mm_add_ps(MUX, C);
            _mm_store_ps(output2 + i, SUM);
        }
    }
    t2 = clock() - t2;
    printf("It took me %ld clicks (%f seconds).\n", (long)t2, ((float)t2) / CLOCKS_PER_SEC);
    printf("Performance is increased by %f times.\n", ((float)t1 / (float)t2));

    /*
     * memcmp takes a size in bytes: compare the whole arrays, not just the
     * first N bytes (which would cover only a quarter of the floats).
     */
    if (memcmp(output1, output2, sizeof output1) == 0) {
        printf("Valid\n");
    } else {
        printf("Invalid\n");
    }
    return 0;
}
/*
 * Fills array[0] .. array[N-1] with pseudo-random values in [0, 1].
 *
 * Each element is rand() divided by RAND_MAX, both converted to float.
 * The caller must pass a buffer of at least N floats.  rand() is not
 * seeded here, so the sequence depends on whatever seed is in effect.
 *
 * Plain C casts are used so the file builds as C as well as C++.
 */
void init_with_rand(float *array)
{
    int i;

    for (i = 0; i < N; i++) {
        array[i] = (float)rand() / (float)RAND_MAX;
    }
}
感谢您的任何建议!