分析表明这个功能对我的应用来说是一个真正的瓶颈:
static inline int countEqualChars(const char* string1, const char* string2, int size) {
int r = 0;
for (int j = 0; j < size; ++j) {
if (string1[j] == string2[j]) {
++r;
}
}
return r;
}
即使使用-O3
和-march=native
,G ++ 4.7.2也不会对此函数进行矢量化(我检查了汇编程序输出)。现在,我不是SSE和朋友的专家,但我认为同时比较多个角色应该更快。关于如何加快速度的任何想法?目标架构是x86-64。
答案 0 :(得分:9)
当然可以。
pcmpeqb
比较两个16字节的向量,并产生一个带有零的向量,它们不同,而-1则匹配。使用它来一次比较16个字节,将结果添加到累加器向量(确保累加最多255个向量比较的结果以避免溢出)。完成后,累加器中有16个结果。求它们并否定得到相等元素的数量。
如果长度非常短,那么很难从这种方法中获得显着的加速。如果长度很长,那么值得追求。
答案 1 :(得分:7)
矢量化的编译器标志:
-ftree-vectorize
-ftree-vectorize -march=<your_architecture>
(使用计算机上可用的所有指令集扩展,而不仅仅是像SSE2 for x86-64这样的基线)。使用-march=native
来优化运行编译器的机器。)-march=<foo>
也设置-mtune=<foo>
,这也是一件好事。
使用SSEx内在函数:
Padd并将缓冲区对齐到16个字节(根据您实际要使用的向量大小)
使用_mm_set1_epi8(0)
countU8
对于所有n / 16个输入(子)向量,请执行:
使用_mm_load_si128或_mm_loadu_si128从两个字符串加载16个字符(对于未对齐的加载)
_mm_cmpeq_epi8
并行比较八位字节。每个匹配产生0xFF
( - 1),否则产生0x00
。
使用_mm_sub_epi8(减-1 - > +1)从countU8
中减去上述结果向量
始终在255个周期后,必须将16个8位计数器提取为更大的整数类型以防止溢出。有关如何执行此操作的详细解答,请参阅解压缩和水平添加:https://stackoverflow.com/a/10930706/1175253
代码:
#include <iostream>
#include <vector>
#include <cassert>
#include <cstdint>
#include <climits>
#include <cstring>
#include <emmintrin.h>
#ifdef __SSE2__
#if !defined(UINTPTR_MAX) || !defined(UINT64_MAX) || !defined(UINT32_MAX)
# error "Limit macros are not defined"
#endif
#if UINTPTR_MAX == UINT64_MAX
#define PTR_64
#elif UINTPTR_MAX == UINT32_MAX
#define PTR_32
#else
# error "Current UINTPTR_MAX is not supported"
#endif
template<typename T>
void print_vector(std::ostream& out,const __m128i& vec)
{
static_assert(sizeof(vec) % sizeof(T) == 0,"Invalid element size");
std::cout << '{';
const T* const end = reinterpret_cast<const T*>(&vec)-1;
const T* const upper = end+(sizeof(vec)/sizeof(T));
for(const T* elem = upper;
elem != end;
--elem
)
{
if(elem != upper)
std::cout << ',';
std::cout << +(*elem);
}
std::cout << '}' << std::endl;
}
#define PRINT_VECTOR(_TYPE,_VEC) do{ std::cout << #_VEC << " : "; print_vector<_TYPE>(std::cout,_VEC); } while(0)
///@note SSE2 required (macro: __SSE2__)
///@warning Not tested!
size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count)
{
assert(a_in != nullptr && (uintptr_t(a_in) % 16) == 0);
assert(b_in != nullptr && (uintptr_t(b_in) % 16) == 0);
//assert(count > 0);
/*
//maybe not so good with all that branching and additional loop variables
__m128i accumulatorU8 = _mm_set1_epi8(0);
__m128i sum2xU64 = _mm_set1_epi8(0);
for(size_t i = 0;i < count;++i)
{
//this operation could also be unrolled, where multiple result registers would be accumulated
accumulatorU8 = _mm_sub_epi8(accumulatorU8,_mm_cmpeq_epi8(*a_in++,*b_in++));
if(i % 255 == 0)
{
//before overflow of uint8, the counter will be extracted
__m128i sum2xU16 = _mm_sad_epu8(accumulatorU8,_mm_set1_epi8(0));
sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16);
//reset accumulatorU8
accumulatorU8 = _mm_set1_epi8(0);
}
}
//blindly accumulate remaining values
__m128i sum2xU16 = _mm_sad_epu8(accumulatorU8,_mm_set1_epi8(0));
sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16);
//do a horizontal addition of the two counter values
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8));
#if defined PTR_64
return _mm_cvtsi128_si64(sum2xU64);
#elif defined PTR_32
return _mm_cvtsi128_si32(sum2xU64);
#else
# error "macro PTR_(32|64) is not set"
#endif
*/
__m128i sum2xU64 = _mm_set1_epi32(0);
while(count--)
{
__m128i matches = _mm_sub_epi8(_mm_set1_epi32(0),_mm_cmpeq_epi8(*a_in++,*b_in++));
__m128i sum2xU16 = _mm_sad_epu8(matches,_mm_set1_epi32(0));
sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16);
#ifndef NDEBUG
PRINT_VECTOR(uint16_t,sum2xU64);
#endif
}
//do a horizontal addition of the two counter values
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8));
#ifndef NDEBUG
std::cout << "----------------------------------------" << std::endl;
PRINT_VECTOR(uint16_t,sum2xU64);
#endif
#if !defined(UINTPTR_MAX) || !defined(UINT64_MAX) || !defined(UINT32_MAX)
# error "Limit macros are not defined"
#endif
#if defined PTR_64
return _mm_cvtsi128_si64(sum2xU64);
#elif defined PTR_32
return _mm_cvtsi128_si32(sum2xU64);
#else
# error "macro PTR_(32|64) is not set"
#endif
}
#endif
int main(int argc, char* argv[])
{
std::vector<__m128i> a(64); // * 16 bytes
std::vector<__m128i> b(a.size());
const size_t nBytes = a.size() * sizeof(std::vector<__m128i>::value_type);
char* const a_out = reinterpret_cast<char*>(a.data());
char* const b_out = reinterpret_cast<char*>(b.data());
memset(a_out,0,nBytes);
memset(b_out,0,nBytes);
a_out[1023] = 1;
b_out[1023] = 1;
size_t equalBytes = counteq_epi8(a.data(),b.data(),a.size());
std::cout << "equalBytes = " << equalBytes << std::endl;
return 0;
}
我为大型和小型阵列实现的最快的SSE实现:
size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count)
{
assert((count > 0 ? a_in != nullptr : true) && (uintptr_t(a_in) % sizeof(__m128i)) == 0);
assert((count > 0 ? b_in != nullptr : true) && (uintptr_t(b_in) % sizeof(__m128i)) == 0);
//assert(count > 0);
const size_t maxInnerLoops = 255;
const size_t nNestedLoops = count / maxInnerLoops;
const size_t nRemainderLoops = count % maxInnerLoops;
const __m128i zero = _mm_setzero_si128();
__m128i sum16xU8 = zero;
__m128i sum2xU64 = zero;
for(size_t i = 0;i < nNestedLoops;++i)
{
for(size_t j = 0;j < maxInnerLoops;++j)
{
sum16xU8 = _mm_sub_epi8(sum16xU8,_mm_cmpeq_epi8(*a_in++,*b_in++));
}
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_sad_epu8(sum16xU8,zero));
sum16xU8 = zero;
}
for(size_t j = 0;j < nRemainderLoops;++j)
{
sum16xU8 = _mm_sub_epi8(sum16xU8,_mm_cmpeq_epi8(*a_in++,*b_in++));
}
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_sad_epu8(sum16xU8,zero));
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8));
#if UINTPTR_MAX == UINT64_MAX
return _mm_cvtsi128_si64(sum2xU64);
#elif UINTPTR_MAX == UINT32_MAX
return _mm_cvtsi128_si32(sum2xU64);
#else
# error "macro PTR_(32|64) is not set"
#endif
}
答案 2 :(得分:3)
当前gcc中的自动向量化有助于编译器理解可以很容易地对代码进行矢量化。在您的情况下:如果您删除条件并以更强制的方式重写代码,它将理解向量化请求:
static inline int count(const char* string1, const char* string2, int size) {
int r = 0;
bool b;
for (int j = 0; j < size; ++j) {
b = (string1[j] == string2[j]);
r += b;
}
return r;
}
在这种情况下:
movdqa 16(%rsp), %xmm1
movl $.LC2, %esi
pxor %xmm2, %xmm2
movzbl 416(%rsp), %edx
movdqa .LC1(%rip), %xmm3
pcmpeqb 224(%rsp), %xmm1
cmpb %dl, 208(%rsp)
movzbl 417(%rsp), %eax
movl $1, %edi
pand %xmm3, %xmm1
movdqa %xmm1, %xmm5
sete %dl
movdqa %xmm1, %xmm4
movzbl %dl, %edx
punpcklbw %xmm2, %xmm5
punpckhbw %xmm2, %xmm4
pxor %xmm1, %xmm1
movdqa %xmm5, %xmm6
movdqa %xmm5, %xmm0
movdqa %xmm4, %xmm5
punpcklwd %xmm1, %xmm6
(等)