我正在尝试使用SSE指令对uint64数组实现线性搜索。uint16和uint32的版本已经可以正常工作,但uint64的代码出现了编译器错误(Linux,gcc——具体环境见文末)。
我的思路是一次比较2x2个64位数字,然后以某种方式把比较结果转换成数组中的索引。下面的代码对uint32是有效的(代码出处: http://schani.wordpress.com/2010/04/30/linear-vs-binary-search/):
#include <xmmintrin.h>
#include <smmintrin.h>
// GCC vector-extension types. Every type below is 16 bytes (128 bits) wide,
// i.e. the size of one SSE register. The ham_* element types are declared
// elsewhere; the lane counts in the names assume ham_uN_t is N bits wide
// (TODO confirm against the ham_* definitions).
typedef ham_u64_t vec2uint64 __attribute__ ((vector_size (16)));
typedef ham_u32_t vec4uint32 __attribute__ ((vector_size (16)));
// 4 x float (float is 4 bytes on the x86-64 target described in this post).
typedef float vec4float __attribute__ ((vector_size (16)));
typedef ham_u16_t vec8uint16 __attribute__ ((vector_size (16)));
typedef ham_u8_t vec16uint8 __attribute__ ((vector_size (16)));
// ...
// One step of the uint32 linear search: tests 16 consecutive elements
// data[start + i] .. data[start + i + 15] against the key.
// key4, data, start and i are defined outside this snippet; key4 is
// presumably the search key replicated into all four 32-bit lanes.

// Four unaligned 16-byte loads, 4 elements each.
vec4uint32 v1 = _mm_loadu_si128((const __m128i *)&data[start + i + 0]);
vec4uint32 v2 = _mm_loadu_si128((const __m128i *)&data[start + i + 4]);
vec4uint32 v3 = _mm_loadu_si128((const __m128i *)&data[start + i + 8]);
vec4uint32 v4 = _mm_loadu_si128((const __m128i *)&data[start + i + 12]);
// Lane-wise equality: a matching 32-bit lane becomes all ones, any other lane zero.
vec4uint32 cmp0 = _mm_cmpeq_epi32(key4, v1);
vec4uint32 cmp1 = _mm_cmpeq_epi32(key4, v2);
vec4uint32 cmp2 = _mm_cmpeq_epi32(key4, v3);
vec4uint32 cmp3 = _mm_cmpeq_epi32(key4, v4);
// Narrow the flags 32 -> 16 bits with signed saturation (all ones, i.e. -1,
// stays -1; zero stays zero), keeping the element order.
vec8uint16 pack01 = __builtin_ia32_packssdw128(cmp0, cmp1);
vec8uint16 pack23 = __builtin_ia32_packssdw128(cmp2, cmp3);
// Narrow again 16 -> 8 bits: one byte per element, byte k for element k.
vec16uint8 pack0123 = __builtin_ia32_packsswb128(pack01, pack23);
// Gather the top bit of each byte: bit k of res is set when element k matched.
int res = __builtin_ia32_pmovmskb128(pack0123);
if (res > 0) {
// ~res + 1 is the two's-complement negation of res, which has the same
// lowest set bit as res, so czt is the position of the first match.
int czt = __builtin_ctz(~res + 1);
return (start + i + czt);
}
这是我目前为uint64写出的代码。比较本身可以工作,我只是不知道如何处理比较结果,而且__builtin_ia32_packssdw()调用无法通过编译:
// Attempted uint64 version of the search step: tests 4 consecutive elements
// data[start + i] .. data[start + i + 3]. This snippet does NOT compile;
// it is shown to illustrate the error quoted below.
// key2, data, start and i are defined outside this snippet.

// Two unaligned 16-byte loads, 2 elements each.
vec2uint64 v1 = _mm_loadu_si128((const __m128i *)&data[start + i + 0]);
vec2uint64 v2 = _mm_loadu_si128((const __m128i *)&data[start + i + 2]);
// Lane-wise 64-bit equality: matching lane becomes all ones, otherwise zero.
vec2uint64 cmp0 = _mm_cmpeq_epi64(key2, v1);
vec2uint64 cmp1 = _mm_cmpeq_epi64(key2, v2);
// Rejected by the compiler: per the error message below, this builtin takes
// two 2-lane int vectors, not the 2-lane 64-bit vectors passed here.
vec4uint32 pack01 = __builtin_ia32_packssdw(cmp0, cmp1); // error
// Zero filler for the upper half of the byte pack.
vec4uint32 pack23 = _mm_set1_epi32(0);
vec16uint8 pack0123 = __builtin_ia32_packsswb128(pack01, pack23);
// Gather the top bit of each byte into an integer mask.
int res = __builtin_ia32_pmovmskb128(pack0123);
if (res > 0) {
// ~res + 1 is -res, which has the same lowest set bit as res.
int czt = __builtin_ctz(~res + 1);
return (start + i + czt);
}
错误说:
error: cannot convert 'vec2uint64 {aka __vector(2) long unsigned int}'
to '__vector(2) int' for argument '1' to '__vector(4) short int
__builtin_ia32_packssdw(__vector(2) int, __vector(2) int)'
(vec2uint64的typedef在最上面,和uint32的代码放在一起。)
我的环境:
Linux ws4484 3.5.0-48-generic #72~precise1-Ubuntu SMP Tue Mar 11 20:09:08 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5)
我的问题不仅是如何修复这个编译器错误,还想知道是否有人有更好的办法来得到匹配元素的数组索引——也许可以完全省去打包(pack)这一步?
提前致谢!
答案 0 :(得分:5)
我建议不要使用GCC的内置函数(__builtin_ia32_*)和GCC的向量扩展类型。只有当你完全不使用非GCC专有的intrinsic(例如_mm_cmpeq_epi32)、并且只打算支持GCC时,这样做才有意义。你想要的功能可以这样实现:
// uint64 search step using only portable intrinsics: tests the 4 elements
// data[start + i] .. data[start + i + 3]. Afterwards bit k of res (k = 0..3)
// is set when data[start + i + k] equals key.
// key, data, start and i are defined outside this snippet.

// Replicate the 64-bit key into both lanes.
__m128i key2 = _mm_set1_epi64x(key);
// Two unaligned 16-byte loads, 2 elements each.
__m128i v1 = _mm_loadu_si128((const __m128i *)&data[start + i + 0]);
__m128i v2 = _mm_loadu_si128((const __m128i *)&data[start + i + 2]);
// Lane-wise 64-bit equality (SSE4.1): matching lane becomes all ones, otherwise zero.
__m128i cmp0 = _mm_cmpeq_epi64(key2, v1);
__m128i cmp1 = _mm_cmpeq_epi64(key2, v2);
// 0xD8 selects 32-bit lanes 0,2,1,3. Both halves of a 64-bit flag are equal,
// so the low 64 bits now hold one 32-bit flag per 64-bit element.
__m128i low2 = _mm_shuffle_epi32(cmp0,0xD8);
__m128i high2 = _mm_shuffle_epi32(cmp1,0xD8);
// Join the two low halves: four 32-bit flags in element order.
__m128i pack = _mm_unpacklo_epi64(low2,high2);
// Narrow 32 -> 16 -> 8 bits with signed saturation (-1 stays -1, 0 stays 0);
// the upper lanes are padded with zeros.
__m128i pack01 = _mm_packs_epi32(pack, _mm_setzero_si128());
__m128i pack0123 = _mm_packs_epi16(pack01, _mm_setzero_si128());
// Gather the top bit of each byte; only bits 0..3 can be set.
int res = _mm_movemask_epi8(pack0123);
你或许能找到一个避免打包、效率更高的版本,但那样就必须使用与__builtin_ctz不同的函数。
对于32位整数,我建议这样写:
// uint32 search step using only portable intrinsics: tests the 16 elements
// data[start + i] .. data[start + i + 15]. Afterwards bit k of res
// (k = 0..15) is set when data[start + i + k] equals key.
// key, data, start and i are defined outside this snippet.

// Replicate the 32-bit key into all four lanes.
__m128i key4 = _mm_set1_epi32(key);
// Four unaligned 16-byte loads, 4 elements each.
__m128i v1 = _mm_loadu_si128((const __m128i *)&data[start + i + 0]);
__m128i v2 = _mm_loadu_si128((const __m128i *)&data[start + i + 4]);
__m128i v3 = _mm_loadu_si128((const __m128i *)&data[start + i + 8]);
__m128i v4 = _mm_loadu_si128((const __m128i *)&data[start + i + 12]);
// Lane-wise equality: matching lane becomes all ones, otherwise zero.
__m128i cmp0 = _mm_cmpeq_epi32(key4, v1);
__m128i cmp1 = _mm_cmpeq_epi32(key4, v2);
__m128i cmp2 = _mm_cmpeq_epi32(key4, v3);
__m128i cmp3 = _mm_cmpeq_epi32(key4, v4);
// Narrow 32 -> 16 bits with signed saturation, keeping element order.
__m128i pack01 = _mm_packs_epi32(cmp0, cmp1);
__m128i pack23 = _mm_packs_epi32(cmp2, cmp3);
// Narrow 16 -> 8 bits: one byte per element.
__m128i pack0123 = _mm_packs_epi16(pack01, pack23);
// Gather the top bit of each byte into a 16-bit mask.
int res = _mm_movemask_epi8(pack0123);
答案 1 :(得分:1)
这是一个搜索匹配64位值的完整解决方案。在这种情况下,值(namehash)是结构成员。此例程在每次迭代时比较8个64位值,并提供匹配的结构索引。
// Searches a struct array for the entry whose 64-bit namehash equals k,
// testing 8 entries per iteration.
// ptr is the struct array; i and res are declared outside this snippet.
// Only ptr[i] is checked for the terminating zero namehash; ptr[i+1] ..
// ptr[i+7] are read unconditionally, hence the length requirement stated
// in the note that follows this code.
__m128i key2 = _mm_set1_epi64x(k); // k is the 64-bit search key, replicated into both lanes
for(;;)
{
// A zero namehash in the first entry of the group ends the search without a match.
if(!ptr[i].namehash)return NULL;
// Gather 8 namehash values; _mm_set_epi64x takes (high, low), so the
// lower-indexed entry lands in the low lane.
__m128i v1 = _mm_set_epi64x (ptr[i+1].namehash,ptr[i].namehash);
__m128i v2 = _mm_set_epi64x (ptr[i+3].namehash,ptr[i+2].namehash);
__m128i v3 = _mm_set_epi64x (ptr[i+5].namehash,ptr[i+4].namehash);
__m128i v4 = _mm_set_epi64x (ptr[i+7].namehash,ptr[i+6].namehash);
// Lane-wise 64-bit equality: matching lane becomes all ones, otherwise zero.
__m128i cmp0 = _mm_cmpeq_epi64(key2, v1);
__m128i cmp1 = _mm_cmpeq_epi64(key2, v2);
__m128i cmp2 = _mm_cmpeq_epi64(key2, v3);
__m128i cmp3 = _mm_cmpeq_epi64(key2, v4);
// 0xD8 selects 32-bit lanes 0,2,1,3: the low 64 bits then hold one 32-bit
// flag per 64-bit element.
__m128i L0 = _mm_shuffle_epi32(cmp0,0xD8);
__m128i H1 = _mm_shuffle_epi32(cmp1,0xD8);
__m128i L2 = _mm_shuffle_epi32(cmp2,0xD8);
__m128i H3 = _mm_shuffle_epi32(cmp3,0xD8);
// Join the low halves: pack0 holds flags for entries 0..3, pack1 for 4..7.
__m128i pack0 = _mm_unpacklo_epi64(L0,H1);
__m128i pack1 = _mm_unpacklo_epi64(L2,H3);
// Narrow 32 -> 16 -> 8 bits with signed saturation: one byte per entry.
__m128i pack01 = _mm_packs_epi32(pack0,pack1);
__m128i pack0123 = _mm_packs_epi16(pack01, _mm_setzero_si128());
// Gather the top bit of each byte: bit k of res is set when ptr[i+k] matched.
res = _mm_movemask_epi8(pack0123);
if(res > 0)break;
i+=8;
}
int index = i + __builtin_ctz(res); // index of the first matching struct in the table
重要说明:结构体数组的长度必须是8的倍数,并且末尾至少要有1个NULL(namehash为0)的成员作为结束标记
或者,如果每次迭代只需要进行2次64位比较,则上述内容可以大大简化为:
/*
 * Two-at-a-time variant: each iteration compares ptr[i].namehash and
 * ptr[i+1].namehash against key2 (the search key replicated into both
 * 64-bit lanes). ptr, i, res and key2 are declared outside this snippet.
 * Only ptr[i] is checked for the terminating zero namehash; ptr[i+1] is
 * read unconditionally.
 */
for(;;)
{
    /* A zero namehash in the first entry of the pair ends the search without a match. */
    if(!ptr[i].namehash)return NULL;
    /* _mm_set_epi64x takes (high, low): ptr[i] goes to the low lane. */
    __m128i v1 = _mm_set_epi64x (ptr[i+1].namehash,ptr[i].namehash);
    /* Matching 64-bit lane becomes all ones, otherwise zero. */
    __m128i cmp0 = _mm_cmpeq_epi64(key2, v1);
    /* One mask bit per byte: bits 0..7 for the low lane, bits 8..15 for the high lane. */
    res = _mm_movemask_epi8(cmp0);
    if(res > 0)break;
    i+=2;
}
/* ctz is 0 when ptr[i] matched and 8 when only ptr[i+1] matched. */
int ctz = __builtin_ctz(res);
/*
 * Each 64-bit lane contributes 8 mask bits, so the lane number is ctz / 8.
 * (A shift by 5 would always give 0 and report ptr[i] even when the match
 * is ptr[i+1].)
 */
int index = i + (ctz>>3); //The struct table index to the matching struct.
答案 2 :(得分:0)
我找不到任何能把64位整数转换(打包)成32位整数的指令,而这正是沿用packssdw这类做法所需要的。下面的方法写起来又长又乱,但应该可以工作:
所以,我认为解决方案是使用位掩码(位0、1、2、3):
以下定义放在循环之前:
/*
 * Per-lane result bits, chosen so that bit k of the final mask stands for
 * element k of the four values being compared (k = 0..3).
 * A GCC vector initializer lists element 0 (the low lane) first, and after
 * _mm_loadu_si128 the low lane holds the lower-addressed array element, so
 * the smaller bit must come first; otherwise counting trailing zeros on the
 * final mask would report the neighbouring element.
 */
vec2uint64 mask0 = { 1, 2 };   /* first pair:  element 0 -> bit 0, element 1 -> bit 1 */
vec2uint64 mask1 = { 4, 8 };   /* second pair: element 2 -> bit 2, element 3 -> bit 3 */
vec2uint64 zero = { 0, 0 };    /* all-zero vector used when moving high lanes down */
循环内部:
// cmp0/cmp1 are the 64-bit compare results (all ones for a matching lane,
// zero otherwise), so the AND keeps a lane's mask constant only if it matched.
vec2uint64 res0 = _mm_and_si128(cmp0, mask0);
vec2uint64 res1 = _mm_and_si128(cmp1, mask1);
// Merge both pairs lane by lane.
vec2uint64 res2 = _mm_or_si128(res0, res1);
然后我们需要把高64位部分移到一个新变量的低64位:
// _mm_unpackhi_epi64(a, zero) yields { high lane of a, 0 }: the high-lane
// bits are moved into the low lane.
vec2uint64 hi0 = _mm_unpackhi_epi64(res0, zero);
vec2uint64 hi1 = _mm_unpackhi_epi64(res1, zero);
// Merge the moved-down high-lane bits of both pairs.
vec2uint64 hi2 = _mm_or_si128(hi0, hi1);
// The low lane of res3 now carries the bits contributed by all four lanes.
vec2uint64 res3 = _mm_or_si128(res2, hi2);
现在,res3的低64位(准确地说是低4位,其余位均为零)就是一个位掩码,每一位表示对应的值是否匹配。
// Copy the low 32 bits of res3 into a scalar int.
int res = _mm_cvtsi128_si32(res3);
(现在你可以像之前一样计算尾随零)。