我一直在努力提高大型(千兆字节)比特阵列操作的性能。我不是SIMD专家,但似乎SIMD在所有情况下都比标量操作慢。我尝试过几次优化,包括循环展开,但无济于事。基于程序集,它似乎是因为标量可以使用寄存器。但是,如果我做了一些愚蠢的事,请告诉我。否则,我很高兴保留标量...它更简单,更简单。
/* gcc -Wall -O3 bitwise-and.c -o bitwise-and -m64 -fomit-frame-pointer -mtune=nocona -msse2 */
#ifdef ENABLE_PREFETCH
#warning "SIMD PREFETCHING ENABLED"
#else
#warning "SIMD PREFETCHING DISABLED"
#endif
#ifdef ENABLE_SIMD_UNROLLING
#warning "UNROLLING SIMD"
#else
#warning "NOT UNROLLING SIMD"
#endif
#ifdef AVOID_TEMP_VARS
#warning "AVOIDING SIMD TEMPORARY VARIABLES"
#else
#warning "USING SIMD TEMPORARY VARIABLES"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <assert.h>
#define __forceinline __attribute__((always_inline))
double
microtime (void)
{
struct timeval time;
gettimeofday(&time, NULL);
return (double) time.tv_sec * 1E6 + (double) time.tv_usec;
}
__forceinline void
simd_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
const __m128i *wrd_ptr = (__m128i *) src;
const __m128i *wrd_end = (__m128i *) (src + block_size);
__m128i *dst_ptr = (__m128i *) dst;
_mm_empty();
do
{
__m128i xmm1;
__m128i xmm2;
#ifdef ENABLE_SIMD_UNROLLING
# ifdef ENABLE_PREFETCH
_mm_prefetch((src + 512), _MM_HINT_NTA);
# endif
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
#else
# ifdef AVOID_TEMP_VARS
xmm1 = _mm_and_si128(*dst_ptr, *wrd_ptr);
# else
xmm1 = _mm_load_si128(wrd_ptr);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
# endif
_mm_store_si128(dst_ptr, xmm1);
++dst_ptr;
++wrd_ptr;
#endif
} while (wrd_ptr < wrd_end);
}
__forceinline void
word_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
unsigned int *wrd_ptr = (unsigned int *) src;
unsigned int *wrd_end = (unsigned int *) (src + block_size);
unsigned int *dst_ptr = (unsigned int *) dst;
do
{
dst_ptr[0] &= wrd_ptr[0];
dst_ptr[1] &= wrd_ptr[1];
dst_ptr[2] &= wrd_ptr[2];
dst_ptr[3] &= wrd_ptr[3];
dst_ptr += 4;
wrd_ptr += 4;
} while (wrd_ptr < wrd_end);
}
int
main (int argc, char **argv)
{
unsigned char *dest;
unsigned char *key1;
unsigned char *key2;
size_t minlen = (1024UL * 1024UL * 512UL);
double start_time = 0.0f;
double end_time = 0.0f;
posix_memalign((void *) &key1, sizeof(__m128i), minlen);
posix_memalign((void *) &key2, sizeof(__m128i), minlen);
posix_memalign((void *) &dest, sizeof(__m128i), minlen);
key1[128] = 0xff;
key2[128] = 0x03;
// 128-bit SIMD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
simd_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 4xWORD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
word_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
free(dest);
free(key2);
free(key1);
return EXIT_SUCCESS;
}
/* vi: set et sw=2 ts=2: */
答案 0 :(得分:7)
这里发生的事情是你被懒惰的虚拟内存分配所困扰。如果您将代码更改为:
// 128-bit SIMD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
simd_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("SIMD Elapsed : %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 4xWORD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
word_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 128-bit SIMD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
simd_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("SIMD Elapsed : %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 4xWORD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
word_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
你应该看到类似这样的结果:
$ ./bitwise-and
SIMD Elapsed : 630061.000000s
Scalar Elapsed: 228156.000000s
SIMD Elapsed : 182645.000000s
Scalar Elapsed: 202697.000000s
$
说明:第一次迭代大内存分配时,就会产生页面错误,因为之前未使用的页面已接通。这为第一个基准测试提供了人为的高速时间,恰好是SIMD基准测试。在第二个及后续的基准测试中,页面都已连接,您可以获得更准确的基准测试,并且正如预期的那样,SIMD例程比标量例程略快。差异并不像预期的那么大,因为每2个负载+ 1个存储只执行一条ALU指令,因此性能受到DRAM带宽而非计算效率的限制。
作为编写基准测试代码时的一般规则:在任何实际时序测量之前始终至少调用一次基准测试例程,以便所有内存分配都正确连接。之后在循环中多次运行基准测试例程并忽略任何异常值。