我正在为一个Zip解密算法的变体测试SSE实现。但是,没有手工使用SSE优化的普通代码反而表现更好。
使用参数 -msse4 -O3 进行编译后,得到如下基准测试结果:
正常测试:0.275,SSE测试:0.655
我尝试增加循环次数,但基准测试结果没有太大变化。编译器到底做了哪些优化?难道我们用SSE就无法达到同样的性能吗?
编辑:按照Jens的建议改用挂钟时间计时,增加了循环迭代次数,并修正了printf的格式说明符。
#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>
#include <time.h>
// Windows
#ifdef _WIN32
#include <Windows.h>
// Wall-clock timestamp in seconds, derived from the Windows performance
// counter divided by its frequency.
// Returns 0 when either the frequency or the counter value cannot be read.
double get_wall_time()
{
    LARGE_INTEGER ticks_per_second;
    LARGE_INTEGER ticks;

    // The frequency is queried first; the counter is only read if that
    // query succeeded (short-circuit evaluation keeps the original order).
    if (!QueryPerformanceFrequency(&ticks_per_second) ||
        !QueryPerformanceCounter(&ticks))
    {
        return 0;
    }

    return (double)ticks.QuadPart / ticks_per_second.QuadPart;
}
// CPU time consumed by the current process, in seconds.
// Only the user-mode time is reported; the kernel-mode time is fetched but
// not added. Returns 0 when GetProcessTimes fails.
double get_cpu_time()
{
    FILETIME creation_time, exit_time, kernel_time, user_time;
    unsigned long long user_ticks;

    if (GetProcessTimes(GetCurrentProcess(),
                        &creation_time, &exit_time,
                        &kernel_time, &user_time) == 0)
    {
        return 0;
    }

    // FILETIME stores one 64-bit count as two 32-bit halves; reassemble it.
    user_ticks = ((unsigned long long)user_time.dwHighDateTime << 32)
               | user_time.dwLowDateTime;

    // Scale the tick count to seconds (one tick is 1e-7 s).
    return (double)user_ticks * 0.0000001;
}
// Posix/Linux
#else
#include <sys/time.h>
// Wall-clock timestamp in seconds (with microsecond resolution), taken from
// gettimeofday(). Returns 0 when gettimeofday() reports an error.
double get_wall_time()
{
    struct timeval now;
    double seconds;

    if (gettimeofday(&now, NULL) != 0)
    {
        return 0;
    }

    // Whole seconds plus the microsecond remainder expressed in seconds.
    seconds = (double)now.tv_sec;
    seconds += (double)now.tv_usec * .000001;
    return seconds;
}
// Processor time reported by clock(), converted to seconds by dividing by
// CLOCKS_PER_SEC.
double get_cpu_time()
{
    const clock_t ticks = clock();

    return (double)ticks / CLOCKS_PER_SEC;
}
#endif
// SSE benchmark: 100000000 times, sums the four bytes of each of four 32-bit
// keys using SSE2 integer intrinsics, adds the result to a running total and
// decrements every key. Prints the total and the elapsed wall-clock time.
// Note: the value printed after "ms:" is in seconds, because that is the
// unit get_wall_time() returns.
static void test_sse()
{
    double start = get_wall_time();
    uint64_t sum = 0;
    uint32_t nk0 = 0x12345678;
    uint32_t nk1 = 0x23456789;
    uint32_t nk2 = 0x34567890;
    uint32_t nk3 = 0x45678901;
    const __m128i mask = _mm_set1_epi32(0xff);
    uint64_t i;

    for (i = 0; i < 100000000; i++)
    {
        uint32_t newKeys[4] = {nk0, nk1, nk2, nk3};
        uint32_t lanes[4];

        // Unaligned load: a uint32_t array is only guaranteed 4-byte
        // alignment, so dereferencing it as an __m128i* is not safe.
        const __m128i keys = _mm_loadu_si128((const __m128i *)newKeys);

        // Isolate byte 0..3 of every lane. The shifts are arithmetic, but the
        // 0xff mask discards any sign-extended bits.
        const __m128i byte0 = _mm_and_si128(keys, mask);
        const __m128i byte1 = _mm_and_si128(_mm_srai_epi32(keys, 8), mask);
        const __m128i byte2 = _mm_and_si128(_mm_srai_epi32(keys, 16), mask);
        const __m128i byte3 = _mm_and_si128(_mm_srai_epi32(keys, 24), mask);

        const __m128i oprsum =
            _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(byte0, byte1), byte2), byte3);

        // Copy the lanes out with a store instead of reading the vector
        // through a uint32_t* (which would violate strict aliasing).
        _mm_storeu_si128((__m128i *)lanes, oprsum);

        uint32_t sum_sse = lanes[0] + lanes[1] + lanes[2] + lanes[3];
        sum += sum_sse;

        nk0--;
        nk1--;
        nk2--;
        nk3--;
    }

    double end = get_wall_time();
    double elapsed = end - start;

    // uint64_t is not unsigned long on every platform (e.g. 64-bit Windows),
    // so convert explicitly and print with %llu.
    printf("SSE Test - Sum: %llu, ms: %f\n", (unsigned long long)sum, elapsed);
}
// Adds up the four individual bytes of a 32-bit value.
static inline uint32_t test_byte_sum(uint32_t v)
{
    return (uint32_t)((uint8_t)(v & 0xff) + (uint8_t)(v >> 8) +
                      (uint8_t)(v >> 16) + (uint8_t)(v >> 24));
}

// Scalar benchmark: 100000000 times, sums the sixteen bytes of four 32-bit
// keys, adds the result to a running total and decrements every key.
// Prints the total and the elapsed wall-clock time.
// Note: the value printed after "ms:" is in seconds, because that is the
// unit get_wall_time() returns.
static void test()
{
    double start = get_wall_time();
    uint64_t sum = 0;
    uint32_t nk0 = 0x12345678;
    uint32_t nk1 = 0x23456789;
    uint32_t nk2 = 0x34567890;
    uint32_t nk3 = 0x45678901;
    uint64_t i;

    for (i = 0; i < 100000000; i++)
    {
        sum += test_byte_sum(nk0) + test_byte_sum(nk1)
             + test_byte_sum(nk2) + test_byte_sum(nk3);

        nk0--;
        nk1--;
        nk2--;
        nk3--;
    }

    double end = get_wall_time();
    double elapsed = end - start;

    // uint64_t is not unsigned long on every platform (e.g. 64-bit Windows),
    // so convert explicitly and print with %llu.
    printf("Normal Test - Sum: %llu, ms: %f\n", (unsigned long long)sum, elapsed);
}
// Entry point: runs the scalar benchmark first, then the SSE benchmark.
int main (int argc, char **argv)
{
    // The command-line arguments are not used.
    (void)argc;
    (void)argv;

    test();
    test_sse();

    return 0;
}
答案 0 :(得分:4)
您使用了错误的工具来衡量性能。clock 函数(至少在符合标准的平台上)给出的是CPU时间,而不是挂钟时间。
您做错的另一件事是用 %d 打印 uint64_t。当 int 小于64位时,这是未定义行为,此时打印出来的 int 很可能只是垃圾值。
编辑:现在您已经修正了代码,我用gcc的 -S 选项将其编译成了汇编代码。实际上,gcc在向量化和展开 test 函数方面做得非常出色。我用手头的三个编译器得到了截然不同的结果,全部使用 -O3 -march=native 编译:
ICC:
Normal Test - Sum: 169248636030, ms: 0.618356
SSE Test - Sum: 169248636030, ms: 1.059261
gcc 4.6:
Normal Test - Sum: 169248636030, ms: 0.462793
SSE Test - Sum: 169248636030, ms: 0.348453
clang:
Normal Test - Sum: 169248636030, ms: 0.625905
SSE Test - Sum: 169248636030, ms: 0.423343
因此,对于未经手工优化的代码,icc和clang的表现大致相同,而gcc要好得多。使用gcc和clang时,你的SSE代码更快;而使用icc时,SSE代码反而更慢。一般来说,与公共领域的“免费”编译器相比,icc(一种需要花真金白银购买的商业编译器)的性能确实令人失望。
编辑2:
现在换用更新版本的gcc之后,我甚至得到了如下结果:
gcc 4.7:
Normal Test - Sum: 169248636030, ms: 0.158695
SSE Test - Sum: 169248636030, ms: 0.406921
所以你看到“正常测试”进一步改善,而 SSE测试比以前差一点。